From fb5c3b2b91aeeb615d6d6d890491af3fdff69556 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Thu, 24 Jul 2003 14:13:19 +0000 Subject: [PATCH] Version 3.3.1. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@6 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- COPYING | 340 ++++++++++++++++++++++++++++++++ Makefile | 26 +++ README | 9 + Robots/__init__.py | 0 Robots/bkmk_rforking.py | 71 +++++++ Robots/bkmk_rforking_sub.py | 45 +++++ Robots/bkmk_rsimple.py | 192 ++++++++++++++++++ Robots/bkmk_rsimple_tos.py | 24 +++ Robots/parse_html.py | 111 +++++++++++ Storage/__init__.py | 0 Storage/bkmk_stflad.py | 128 ++++++++++++ Storage/bkmk_stpickle.py | 31 +++ Writers/__init__.py | 0 Writers/bkmk_wflad.py | 58 ++++++ Writers/bkmk_wflad_err.py | 41 ++++ Writers/bkmk_whtml.py | 47 +++++ Writers/bkmk_wtxt.py | 30 +++ bkmk-add | 9 + bkmk-add.py | 85 ++++++++ bkmk-chk | 16 ++ bkmk-koi | 45 +++++ bkmk-rsync | 9 + bkmk-set | 31 +++ bkmk-sort | 11 ++ bkmk-win | 41 ++++ bkmk2db | 23 +++ bkmk2db.py | 103 ++++------ bkmk_objects.py | 159 +++++++++++++++ bkmk_parser.py | 383 +++++++++++++----------------------- check_db.py | 186 ----------------- check_dups.py | 90 +++++++++ check_new.py | 27 --- check_old.py | 27 --- check_title.py | 75 +++++++ check_url_sub.py | 145 -------------- check_urls.py | 159 +++++++++++++++ check_urls2.py | 310 ----------------------------- chk_urls.py | 321 ------------------------------ convert_st.py | 55 ++++++ copy_err.py | 24 --- db2bkmk.py | 224 ++++++--------------- doc/ANNOUNCE | 76 +++++++ doc/ChangeLog | 0 doc/NEWS | 0 doc/README | 226 +++++++++++++++++++++ doc/TODO | 25 +++ hotexplode.pl | 180 +++++++++++++++++ koi2win.db | 14 -- mz-unescape | 6 + ns-unescape | 5 + readme | 207 ------------------- robots.py | 14 ++ set-real_title.py | 84 ++++++++ sort_db.py | 117 +++++++++++ storage.py | 14 ++ writers.py | 14 ++ 56 files changed, 2956 insertions(+), 1737 deletions(-) create mode 100644 COPYING create mode 100644 Makefile create mode 100644 
README create mode 100644 Robots/__init__.py create mode 100644 Robots/bkmk_rforking.py create mode 100755 Robots/bkmk_rforking_sub.py create mode 100644 Robots/bkmk_rsimple.py create mode 100644 Robots/bkmk_rsimple_tos.py create mode 100755 Robots/parse_html.py create mode 100644 Storage/__init__.py create mode 100644 Storage/bkmk_stflad.py create mode 100644 Storage/bkmk_stpickle.py create mode 100644 Writers/__init__.py create mode 100644 Writers/bkmk_wflad.py create mode 100644 Writers/bkmk_wflad_err.py create mode 100644 Writers/bkmk_whtml.py create mode 100644 Writers/bkmk_wtxt.py create mode 100755 bkmk-add create mode 100755 bkmk-add.py create mode 100755 bkmk-chk create mode 100755 bkmk-koi create mode 100755 bkmk-rsync create mode 100755 bkmk-set create mode 100755 bkmk-sort create mode 100755 bkmk-win create mode 100755 bkmk2db create mode 100644 bkmk_objects.py delete mode 100755 check_db.py create mode 100755 check_dups.py delete mode 100755 check_new.py delete mode 100755 check_old.py create mode 100755 check_title.py delete mode 100755 check_url_sub.py create mode 100755 check_urls.py delete mode 100755 check_urls2.py delete mode 100755 chk_urls.py create mode 100755 convert_st.py delete mode 100755 copy_err.py create mode 100644 doc/ANNOUNCE create mode 100644 doc/ChangeLog create mode 100644 doc/NEWS create mode 100644 doc/README create mode 100644 doc/TODO create mode 100755 hotexplode.pl create mode 100755 mz-unescape create mode 100755 ns-unescape delete mode 100644 readme create mode 100644 robots.py create mode 100755 set-real_title.py create mode 100755 sort_db.py create mode 100644 storage.py create mode 100644 writers.py diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..eeb586b --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License.
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e6ee9e8 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ + + +VERSION=3.3.1 +EXAMPLE_SHELL=ns-unescape mz-unescape bkmk2db koi2win.db \ + bkmk-koi bkmk-win bkmk-chk bkmk-set bkmk-sort hotexplode.pl \ + check_title.py set-real_title.py bkmk-add.py bkmk-add bkmk-rsync + + +.PHONY: all +all: + @echo "Nothing to be done for \`all'" + + +distr: clean examples_distr + cd .. && tar cf - bookmarks_db | gzip -9 > bookmarks_db-$(VERSION)-`date +'%Y%m%d'`.tar.gz + + +.PHONY: examples_distr +examples_distr: + tar cf - $(EXAMPLE_SHELL) | gzip -9 > ../bookmarks_sh-$(VERSION)-`date +'%Y%m%d'`.tar.gz + rm $(EXAMPLE_SHELL) + + +.PHONY: clean +clean: + find . -name '*.py[co]' -print | xargs rm -f diff --git a/README b/README new file mode 100644 index 0000000..7baedd4 --- /dev/null +++ b/README @@ -0,0 +1,9 @@ +Bookmarks database and Internet robot. + +Author: Oleg Broytmann + +Copyright (C) 1997-2002 PhiloSoft Design. + +License: GPL. For detailed terms see COPYING. + +Documentation is in directory doc.
diff --git a/Robots/__init__.py b/Robots/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Robots/bkmk_rforking.py b/Robots/bkmk_rforking.py new file mode 100644 index 0000000..9e2d5d8 --- /dev/null +++ b/Robots/bkmk_rforking.py @@ -0,0 +1,71 @@ +""" + Forking robot + + Written by BroytMann, Mar 2000 - Jun 2002. Copyright (C) 2000-2002 PhiloSoft Design +""" + + +try: + import cPickle + pickle = cPickle +except ImportError: + import pickle + +import sys, os +from subproc import Subprocess, RecordFile + + +check_subp = None +subp_pipe = None + +def stop_subp(log): + global check_subp, subp_pipe + if check_subp: + if log: log(" restarting hanging subprocess") + check_subp = None # rebind instead of "del": the global must stay defined for later "if check_subp" tests + subp_pipe = None # dropping both references releases the subprocess and its pipe wrapper + +def restart_subp(log): + global check_subp, subp_pipe + stop_subp(log) + + check_subp = Subprocess("%s/Robots/bkmk_rforking_sub.py" % os.path.dirname(sys.argv[0])) + subp_pipe = RecordFile(check_subp) + + +from bkmk_objects import Robot + +class robot_forking(Robot): + def check_url(self, bookmark, url_type, url_rest): + if not check_subp: + restart_subp(self.log) # Not restart, just start afresh + + try: + save_parent = bookmark.parent + bookmark.parent = None + + bookmark.tempfname = self.tempfname + subp_pipe.write_record(pickle.dumps(bookmark)) + + if check_subp.waitForPendingChar(900): # wait 15 minutes + new_b = pickle.loads(subp_pipe.read_record()) + for attr in ("error", "no_error", + "moved", "size", "md5", "real_title", + "last_tested", "last_modified", "test_time"): + if hasattr(new_b, attr): + setattr(bookmark, attr, getattr(new_b, attr)) + else: + bookmark.error = "Subprocess connection timed out" + restart_subp(self.log) + + bookmark.parent = save_parent + + except KeyboardInterrupt: + return 0 + + # Tested + return 1 + + + def stop(self): + stop_subp(None) # Stop subprocess; do not log restarting diff --git a/Robots/bkmk_rforking_sub.py b/Robots/bkmk_rforking_sub.py new file mode 100755 index 0000000..b2543f1 --- /dev/null +++ 
b/Robots/bkmk_rforking_sub.py @@ -0,0 +1,45 @@ +#! /usr/local/bin/python -O +""" + Check URL - subprocess + + Written by BroytMann, Mar 1999 - Aug 2002. Copyright (C) 1999-2002 PhiloSoft Design +""" + + +import sys, os, urllib + +lib_dir = os.path.normpath(os.path.dirname(sys.argv[0]) + os.sep + os.pardir) +sys.path.append(lib_dir) # for bkmk_objects.py + +try: + import cPickle + pickle = cPickle +except ImportError: + import pickle + +from subproc import RecordFile +import bkmk_rsimple + + +def run(): + bkmk_in = RecordFile(sys.stdin) + bkmk_out = RecordFile(sys.stdout) + + from m_lib.flog import openlog + log = openlog("check2.log") + from bkmk_rsimple import robot_simple + robot = robot_simple(None, log) + + while 1: # no break below: the loop ends only via an exception, so log.close() is not reached normally + bookmark = pickle.loads(bkmk_in.read_record()) + log(bookmark.href) + url_type, url_rest = urllib.splittype(bookmark.href) + robot.check_url(bookmark, url_type, url_rest) + bkmk_out.write_record(pickle.dumps(bookmark)) + log.outfile.flush() + + log.close() + + +if __name__ == '__main__': + run() diff --git a/Robots/bkmk_rsimple.py b/Robots/bkmk_rsimple.py new file mode 100644 index 0000000..7a96d25 --- /dev/null +++ b/Robots/bkmk_rsimple.py @@ -0,0 +1,192 @@ +""" + Simple, straightforward robot; guaranteed to have problems with timeouts :) + + Written by BroytMann, Mar 2000 - Aug 2002.
Copyright (C) 2000-2002 PhiloSoft Design +""" + + +class RedirectException(Exception): + reloc_dict = { + 301: "perm.", + 302: "temp.", + "html": "html" + } + def __init__(self, errcode, newurl): + Exception.__init__(self, "(%s) to %s" % (self.reloc_dict[errcode], newurl)) + + +import string, os +import time, urllib +from m_lib.www.util import parse_time +from m_lib.md5wrapper import md5wrapper + + +class MyURLopener(urllib.URLopener): + # Error 302 -- relocated (temporarily) + def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): + if headers.has_key('location'): + newurl = headers['location'] + elif headers.has_key('uri'): + newurl = headers['uri'] + else: + newurl = "Nowhere" + raise RedirectException(errcode, newurl) + + # Error 301 -- also relocated (permanently) + http_error_301 = http_error_302 + + # Error 401 -- authentication required + def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): + raise IOError, ('http error', errcode, "Authentication required ", headers) + + +urllib._urlopener = MyURLopener() + +# Some sites allow only Mozilla-compatible browsers; way to stop robots? 
+server_version = "Mozilla/3.0 (compatible; Python-urllib/%s)" % urllib.__version__ +urllib._urlopener.addheaders[0] = ('User-agent', server_version) + + +def get_error(msg): + if type(msg) == type(""): + return msg + + else: + s = [] + for i in msg: + s.append("'%s'" % string.join(string.split(str(i), "\n"), "\\n")) + return "(%s)" % string.join(s) + + +urllib_ftpwrapper = urllib.ftpwrapper +ftpcache_key = None + +class myftpwrapper(urllib_ftpwrapper): + def __init__(self, user, passwd, host, port, dirs): + urllib_ftpwrapper.__init__(self, user, passwd, host, port, dirs) + global ftpcache_key + ftpcache_key = (user, host, port, string.join(dirs, '/')) + +urllib.ftpwrapper = myftpwrapper + +def get_welcome(): + global ftpcache_key + _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome + ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db. + # If there are - ftpcache_key in prev line is invalid. + return _welcome + + +from bkmk_objects import Robot +from parse_html import parse_html + +class robot_simple(Robot): + def check_url(self, bookmark, url_type, url_rest): + if not self.tempfname: + self.tempfname = bookmark.tempfname + + try: + try: + self.start = int(time.time()) + url_host, url_path = urllib.splithost(url_rest) + url_path, url_tag = urllib.splittag(url_path) + + fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path), self.tempfname) + + size = 0 + last_modified = None + + if headers: + try: + size = headers["Content-Length"] + except KeyError: + pass + + try: + last_modified = headers["Last-Modified"] + except KeyError: + pass + + if last_modified: + last_modified = parse_time(last_modified) + + if last_modified: + last_modified = str(int(last_modified)) + else: + last_modified = bookmark.last_visit + + bookmark.size = size + bookmark.last_modified = last_modified + + md5 = md5wrapper() + if urllib._urlopener.type == "ftp": # Pass welcome message through MD5 + md5.update(get_welcome()) + + 
md5.md5file(self.tempfname) + bookmark.md5 = str(md5) + + if headers: + try: + content_type = headers["Content-Type"] + try: + content_type, charset = content_type.split(';') + content_type = content_type.strip() + charset = charset.split('=')[1].strip() + if self.log: self.log(" HTTP charset : %s" % charset) + except (ValueError, IndexError): + charset = None + if self.log: self.log(" no charset in Content-Type header") + if content_type == "text/html": + parser = parse_html(fname, charset, self.log) + title = parser.title.replace('\r', '').replace('\n', ' ').strip() + bookmark.real_title = parser.unescape(title) + if self.log: self.log(" final title : %s" % bookmark.real_title) + if parser.refresh: + refresh = parser.refresh + try: + timeout = int(refresh.split(';')[0]) + except (IndexError, ValueError): + timeout = "ERROR" + try: + url = refresh.split('=', 1)[1] + except IndexError: + url = "self" + raise RedirectException("html", "%s (%s sec)" % (url, timeout)) # %s, not %d: timeout is the string "ERROR" when the delay cannot be parsed + except KeyError: + pass + + except IOError, msg: + if (msg[0] == "http error") and (msg[1] == -1): + bookmark.no_error = "The server did not return any header - it is not an error, actually" + else: + bookmark.error = get_error(msg) + + except EOFError: + bookmark.error = "Unexpected EOF (FTP server closed connection)" + + except RedirectException, msg: + bookmark.moved = str(msg) + + except KeyboardInterrupt: + return 0 + + finally: + self.finish_check_url(bookmark) + + # Tested + return 1 + + + def finish_check_url(self, bookmark): + # Calculate these attributes even in case of an error + if os.path.exists(self.tempfname): + size = str(os.stat(self.tempfname).st_size) + if size[-1] == 'L': + size = size[:-1] + bookmark.size = size + + start = self.start + bookmark.last_tested = str(start) + + now = int(time.time()) + bookmark.test_time = str(now - start) diff --git a/Robots/bkmk_rsimple_tos.py b/Robots/bkmk_rsimple_tos.py new file mode 100644 index 0000000..581fb69 --- /dev/null +++ 
b/Robots/bkmk_rsimple_tos.py @@ -0,0 +1,24 @@ +#! /usr/local/bin/python -O +""" + Simple robot with timeoutsocket + + Written by BroytMann, Sep 2000. Copyright (C) 2000 PhiloSoft Design +""" + + +import socket, timeoutsocket +timeoutsocket.setDefaultSocketTimeout(900) + +from bkmk_rsimple import robot_simple, get_error + +class robot_simple_tos(robot_simple): + def check_url(self, bookmark, url_type, url_rest): + try: + return robot_simple.check_url(self, bookmark, url_type, url_rest) + + except (socket.error, timeoutsocket.Timeout), msg: + bookmark.error = get_error(msg) + + self.finish_check_url(bookmark) + + return 1 diff --git a/Robots/parse_html.py b/Robots/parse_html.py new file mode 100755 index 0000000..e95dcef --- /dev/null +++ b/Robots/parse_html.py @@ -0,0 +1,111 @@ +#! /usr/local/bin/python -O +""" + HTML Parser + + Written by BroytMann, Jun 2002 - Aug 2002. Copyright (C) 1997-2002 PhiloSoft Design +""" + + +import sys +current_charset = sys.getdefaultencoding() +DEFAULT_CHARSET = "windows-1251" + + +from HTMLParser import HTMLParseError +from m_lib.www.html import HTMLParser as _HTMLParser + + +class HTMLHeadDone(Exception): pass + + +class HTMLParser(_HTMLParser): + def __init__(self, charset=None): + _HTMLParser.__init__(self) + self.charset = charset + self.meta_charset = 0 + self.title = '' + self.refresh = '' + + def end_head(self): + raise HTMLHeadDone() + + + def do_meta(self, attrs): + http_equiv = "" + content = "" + + for attrname, value in attrs: + if value: + value = value.strip() + if attrname == 'http-equiv': + http_equiv = value.lower() + elif attrname == 'content': + content = value + + if (not self.charset) and (http_equiv == "content-type"): + try: + # extract charset from "text/html; foo; charset=UTF-8; bar;" + self.charset = content.lower().split('charset=')[1].split(';')[0] + self.meta_charset = 1 + except IndexError: + pass + + if http_equiv == "refresh": + self.refresh = content + + + def start_title(self, attrs): + 
self.accumulator = '' + def end_title(self): + if not self.title: # use only the first title + self.title = self.accumulator + + +def parse_html(filename, charset=None, log=None): + infile = open(filename, 'r') + parser = HTMLParser(charset) + + for line in infile: + try: + parser.feed(line) + except (HTMLParseError, HTMLHeadDone): + break + + infile.close() + + try: + parser.close() + except (HTMLParseError, HTMLHeadDone): + pass + + if not parser.charset: + title = parser.title + ascii = 1 + for c in title: + if not (32 <= ord(c) <= 127): # non-ASCII character + ascii = 0 + break + if not ascii: + parser.charset = DEFAULT_CHARSET + if parser.charset and (parser.charset <> current_charset): + try: + if parser.meta_charset: + if log: log(" META charset : %s" % parser.charset) + else: + if log: log(" charset : %s" % parser.charset) + if log: log(" title : %s" % parser.title) + parser.title = unicode(parser.title, parser.charset, "replace").encode(current_charset, "replace") + if log: log(" current charset: %s" % current_charset) + if log: log(" converted title: %s" % parser.title) + except LookupError: + if log: log(" unknown charset: `%s' or `%s'" % (parser.charset, current_charset)) + + return parser + + +if __name__ == '__main__': + import sys + parser = parse_html(sys.argv[1]) + print parser.charset + print parser.title + print parser.refresh diff --git a/Storage/__init__.py b/Storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Storage/bkmk_stflad.py b/Storage/bkmk_stflad.py new file mode 100644 index 0000000..3c89a3e --- /dev/null +++ b/Storage/bkmk_stflad.py @@ -0,0 +1,128 @@ +""" + Bookmarks storage module - FLAD (Flat ASCII Database) + special version for compatibility with old (version 1) bkmk2db + + Written by BroytMann, Feb 2000 - Mar 2000. 
Copyright (C) 2000 PhiloSoft Design +""" + + +import string +from m_lib.flad import fladm +from bkmk_objects import Folder, Bookmark, Ruler, Walker + + +class storage_flad(Walker): + filename = "bookmarks_db.flad" + + def __init__(self): + self.first_object = 1 + + def root_folder(self, f): + header = string.replace(f.header, ".", ".\n") + header = string.replace(header, "<", "\n<", 3)[1:] + header_file = open("header", 'w') + header_file.write(header + "\n") + header_file.write('

%s

\n\n' % f.name) + if f.comment: header_file.write('
%s\n' % f.comment) + header_file.close() + + def start_folder(self, f, level): + self.outfile.write(""" +Level: %d +Folder: %s +AddDate: %s +Comment: %s +""" % (level, f.name, f.add_date, f.comment)) + + + def bookmark(self, b, level): + self.outfile.write(""" +Level: %d +Title: %s +URL: %s +AddDate: %s +LastVisit: %s +LastModified: %s +Comment: %s +""" % (level+1, b.name, b.href, b.add_date, b.last_visit, b.last_modified, b.comment)) + + + def ruler(self, r, level): + self.outfile.write("\nLevel: %s\nRuler: YES\n" % (level+1)) + + + def store(self, root_folder): + self.outfile = open(self.filename, 'w') + root_folder.walk_depth(self) + self.outfile.close() + + + def unindent(self, old_level, new_level): + while old_level > new_level: + old_level = old_level - 1 + del self.folder_stack[-1] + + if self.folder_stack: + self.current_folder = self.folder_stack[-1] + else: + self.current_folder = None + + def load(self): + bookmarks_db = fladm.load_from_file(self.filename, fladm.check_record, ["Level"]) + + root_folder = Folder() + self.folder_stack = [root_folder] + self.current_folder = root_folder + + header_file = open("header", 'r') + header = header_file.read() + header_file.close() + + header = string.split(header, "\n") + root_folder.header = string.join(header[:5], '') + root_folder.name = header[5][4:-5] + root_folder.comment = string.join(header[7:], '')[4:] + + save_level = 0 + got_folder = 1 # Start as if we already have one folder + + for record in bookmarks_db: + level = int(record["Level"]) + + if level == save_level: + pass + elif level == save_level + 1: + if not got_folder: + raise ValueError, "indent without folder" + elif level <= save_level - 1: + self.unindent(save_level, level) + else: + raise ValueError, "new level (%d) too big; must be %d - %d" % (level, save_level-1, save_level+1) + + save_level = level + got_folder = record.has_key("Folder") # Test here to save got_folder for next loop + + if record.has_key("URL"): + bookmark = 
Bookmark(record["URL"], record["AddDate"], record["LastVisit"], record["LastModified"], record["Comment"]) + bookmark.name = record["Title"] + self.current_folder.append(bookmark) + + elif record.has_key("Folder"): + folder = Folder(record["AddDate"], record["Comment"]) + folder.name = record["Folder"] + self.current_folder.append(folder) + self.folder_stack.append(folder) + self.current_folder = folder + + elif record.has_key("Ruler"): + self.current_folder.append(Ruler()) + + else: + raise KeyError, "neither \"URL\" nor \"Folder\" nor \"Ruler\" in record " + str(record) + + if save_level >= 0: + self.unindent(save_level, 0) + else: + raise ValueError, "new level (%d) too little - must be >= 0" % save_level + + return root_folder diff --git a/Storage/bkmk_stpickle.py b/Storage/bkmk_stpickle.py new file mode 100644 index 0000000..2ed61d3 --- /dev/null +++ b/Storage/bkmk_stpickle.py @@ -0,0 +1,31 @@ +""" + Bookmarks storage manager - pickle; certainly the most simple and elegant :) + + Written by BroytMann, Feb 2000 - Mar 2000. Copyright (C) 2000 PhiloSoft Design +""" + + +try: + import cPickle + pickle = cPickle + +except ImportError: + import pickle + + +class storage_pickle: + filename = "bookmarks_db.pickle" + + + def store(self, root_folder): + outfile = open(self.filename, 'wb') + pickle.dump(root_folder, outfile, 1) + outfile.close() + + + def load(self): + infile = open(self.filename, 'rb') + root_folder = pickle.load(infile) + infile.close() + + return root_folder diff --git a/Writers/__init__.py b/Writers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Writers/bkmk_wflad.py b/Writers/bkmk_wflad.py new file mode 100644 index 0000000..dc46a83 --- /dev/null +++ b/Writers/bkmk_wflad.py @@ -0,0 +1,58 @@ +""" + Dump bookmarks db to a more readable FLAD after check_urls + + Written by BroytMann, Apr 2000 - Jun 2002. 
Copyright (C) 2000-2002 PhiloSoft Design +""" + + +import time +from bkmk_objects import Writer + + +def strftime(s): + try: + return time.strftime("%a %d %b %Y %T", time.localtime(int(s))) + except ValueError: # s is already formatted + return s + + +class writer_flad(Writer): + filename = "bookmarks_db.flad" + + def __init__(self, outfile, prune=None): + Writer.__init__(self, outfile, prune) + self.first_object = 1 + + + def start_folder(self, f, level): + self.outfile.write(""" +Level: %d +Folder: %s +AddDate: %s +Comment: %s +""" % (level, f.name, strftime(f.add_date), f.comment)) + + + def bookmark(self, b, level): + self.outfile.write(""" +Level: %d +Title: %s +URL: %s +AddDate: %s +LastVisit: %s +LastModified: %s +Comment: %s""" % (level+1, b.name, b.href, strftime(b.add_date), strftime(b.last_visit), strftime(b.last_modified), b.comment)) + + for attr_name, attr_out in (("error", "Error"), ("no_error", "NoError"), + ("moved", "Moved"), ("size", "Size"), ("md5", "Md5"), + ("real_title", "RealTitle"), ("test_time", "TestTime")): + if hasattr(b, attr_name): + self.outfile.write("\n%s: %s" % (attr_out, getattr(b, attr_name))) + + if hasattr(b, "last_tested"): + self.outfile.write("\n%s: %s" % ("LastTested", strftime(getattr(b, "last_tested")))) + + self.outfile.write("\n") + + def ruler(self, r, level): + self.outfile.write("\nLevel: %s\nRuler: YES\n" % (level+1)) diff --git a/Writers/bkmk_wflad_err.py b/Writers/bkmk_wflad_err.py new file mode 100644 index 0000000..dc02c23 --- /dev/null +++ b/Writers/bkmk_wflad_err.py @@ -0,0 +1,41 @@ +""" + Dump bookmarks db to a more readable FLAD after check_urls + + Written by BroytMann, Oct 2000 - Jun 2002. 
Copyright (C) 2000-2002 PhiloSoft Design +""" + + +import time +from bkmk_objects import Writer + + +def strftime(s): + return time.strftime("%a %d %b %Y %T", time.localtime(int(s))) + + +class writer_flad_err(Writer): + filename = "bookmarks_db.errors" + + def bookmark(self, b, level): + if not hasattr(b, "error"): + return + + self.outfile.write(""" +Level: %d +Title: %s +URL: %s +AddDate: %s +LastVisit: %s +LastModified: %s +Comment: %s""" % (1, b.name, b.href, strftime(b.add_date), strftime(b.last_visit), strftime(b.last_modified), b.comment)) + + for attr_name, attr_out in (("error", "Error"), ("no_error", "NoError"), + ("moved", "Moved"), ("size", "Size"), ("md5", "Md5"), + ("real_title", "RealTitle"), ("test_time", "TestTime")): + if hasattr(b, attr_name): + self.outfile.write("\n%s: %s" % (attr_out, getattr(b, attr_name))) + + if hasattr(b, "last_tested"): + self.outfile.write("\n%s: %s" % ("LastTested", strftime(getattr(b, "last_tested")))) + + self.outfile.write("\n") diff --git a/Writers/bkmk_whtml.py b/Writers/bkmk_whtml.py new file mode 100644 index 0000000..38918df --- /dev/null +++ b/Writers/bkmk_whtml.py @@ -0,0 +1,47 @@ +#! /usr/local/bin/python -O +""" + Convert a bkmk database back to bookmarks.html + + Written by BroytMann, Mar 2000. Copyright (C) 2000 PhiloSoft Design +""" + + +import string + + +def dump_comment(comment): + return string.replace(comment, "
", "
\n") + + +ind_s = " "*4 + +from bkmk_objects import Writer + +class writer_html(Writer): + filename = "bookmarks.html" + + def _folder(self, f, level): + if f.comment: self.outfile.write('
%s\n' % dump_comment(f.comment)) + self.outfile.write(ind_s*level + "

\n") + + def root_folder(self, f): + header = string.replace(f.header, ".", ".\n") + header = string.replace(header, "\n") + self.outfile.write(header + "\n") + self.outfile.write('

%s

\n\n' % f.name) + self._folder(f, 0) + + def start_folder(self, f, level): + self.outfile.write(ind_s*level + '

%s

\n' % (f.add_date, f.name)) + self._folder(f, level) + + def end_folder(self, f, level): + self.outfile.write(ind_s*level + "

\n") + + def bookmark(self, b, level): + self.outfile.write(ind_s*(level+1) + '

%s\n' % (b.href, b.add_date, b.last_visit, b.last_modified, b.name)) + if b.comment: self.outfile.write('
%s\n' % dump_comment(b.comment)) + + def ruler(self, r, level): + self.outfile.write(ind_s*(level+1) + "
\n") diff --git a/Writers/bkmk_wtxt.py b/Writers/bkmk_wtxt.py new file mode 100644 index 0000000..1256114 --- /dev/null +++ b/Writers/bkmk_wtxt.py @@ -0,0 +1,30 @@ +#! /usr/local/bin/python -O +""" + Dump a bkmk database to a text file + + Written by BroytMann, Mar 2000. Copyright (C) 2000 PhiloSoft Design +""" + + +ind_s = " "*4 + + +from bkmk_objects import Writer + +class writer_txt(Writer): + filename = "dump.txt" + + def root_folder(self, f): + self.outfile.write("Folder: %s\n" % f.name) + + def start_folder(self, f, level): + self.outfile.write(ind_s*level + "Folder: %s\n" % f.name) + + def end_folder(self, f, level): + self.outfile.write(ind_s*level + "Folder end: %s\n" % f.name) + + def bookmark(self, b, level): + self.outfile.write(ind_s*(level+1) + "Bookmark: %s\n" % b.name) + + def ruler(self, r, level): + self.outfile.write(ind_s*(level+1) + "-----\n") diff --git a/bkmk-add b/bkmk-add new file mode 100755 index 0000000..4b4568b --- /dev/null +++ b/bkmk-add @@ -0,0 +1,9 @@ +#! /bin/sh + +PATH=$HOME/lib/bookmarks_db:$PATH + +bkmk2db || exit 1 + +bkmk-add.py "$@" && \ +db2bkmk.py && \ +exec touch bookmarks_db.pickle # to make it more recent diff --git a/bkmk-add.py b/bkmk-add.py new file mode 100755 index 0000000..921eebe --- /dev/null +++ b/bkmk-add.py @@ -0,0 +1,85 @@ +#! /usr/local/bin/python -O +""" + Add a bookmark to the database. + + Written by BroytMann, Aug 2002. 
Copyright (C) 2002 PhiloSoft Design +""" + + +import sys, os, time, urllib +from bkmk_objects import Bookmark +from Robots.bkmk_rsimple import robot_simple + +import tempfile +tempfname = "bkmk-add" + tempfile.gettempprefix() + "tmp" + + +def run(): + from getopt import getopt + optlist, args = getopt(sys.argv[1:], "t:") + + report_stats = 1 + title = '' + + for _opt, _arg in optlist: + if _opt == '-s': + report_stats = 0 + elif _opt == '-t': + title = _arg + try: + del _opt, _arg + except NameError: + pass + + if len(args) <> 1: + sys.stderr.write("bkmk-add: too many or too few arguments\n") + sys.stderr.write("Usage: bkmk-add [-s] [-t title] url\n") + sys.exit(1) + + from storage import storage, import_storage + storage = storage() + + if report_stats: + sys.stdout.write("Loading %s: " % storage.filename) + sys.stdout.flush() + + root_folder = storage.load() + + if report_stats: + print "Ok" + + href = args[0] + now = int(time.time()) + bookmark = Bookmark(href, str(now), '0', '0') + bookmark.name = '' + + robot = robot_simple(tempfname, None) + url_type, url_rest = urllib.splittype(href) + + if robot.check_url(bookmark, url_type, url_rest): # get real title and last modified date + if title: # forced title + bookmark.name = title + elif hasattr(bookmark, "real_title"): + bookmark.name = bookmark.real_title + if report_stats: + sys.stdout.write("Adding %s with title '%s'\n" % (href, bookmark.name)) + root_folder.append(bookmark) + + if report_stats: + sys.stdout.write("Storing %s: " % storage.filename) + sys.stdout.flush() + + storage.store(root_folder) + + if report_stats: + print "Ok" + + + try: + os.unlink(tempfname) + except os.error: + pass + + +if __name__ == '__main__': + run() diff --git a/bkmk-chk b/bkmk-chk new file mode 100755 index 0000000..82f3fbe --- /dev/null +++ b/bkmk-chk @@ -0,0 +1,16 @@ +#! 
/bin/sh + +PATH=$HOME/lib/bookmarks_db:$PATH + +bkmk2db || exit 1 + +check_urls.py || exit 1 +check_urls.py -e || exit 1 + +# Report results +BKMK_WRITER=flad db2bkmk.py || exit 1 +check_dups.py -s -l bookmarks.err || exit 1 + +# Write results to output bookmarks files +db2bkmk.py || exit 1 +diff $HOME/.netscape/bookmarks.html bookmarks.html > diff.out diff --git a/bkmk-koi b/bkmk-koi new file mode 100755 index 0000000..8657b3c --- /dev/null +++ b/bkmk-koi @@ -0,0 +1,45 @@ +#! /bin/sh + +PATH=$HOME/lib/bookmarks_db:$PATH + +if [ -f ./bookmarks.html ]; then + BKMK= +elif [ -f $HOME/.netscape/bookmarks.html ]; then + BKMK=$HOME/.netscape/bookmarks.html +else + echo Cannot find bookmarks.html, aborted + exit 1 +fi + + +ns-unescape || exit 1 +bkmk2db || exit 1 +#check_db.py -l bookmarks.err || exit 1 + +#cp $HOME/Internet/WWW/header header +db2bkmk.py || exit 1 +db2bkmk.py -p "Private links" -o public-koi.tmp || exit 1 + +db2bkmk.py -t $HOME/lib/bookmarks_db/koi2win.db -o private-win.tmp || exit 1 +db2bkmk.py -t $HOME/lib/bookmarks_db/koi2win.db -p "Private links" -o public-win.tmp || exit 1 + + +replace() { + sed "s^$2^$3^" $1 > _tmp.$$ + mv _tmp.$$ $1 +} + +replace private-win.tmp http://koi.aha.ru/ http://win.aha.ru/ +replace public-win.tmp http://koi.aha.ru/ http://win.aha.ru/ +replace private-win.tmp http://koi.postman.ru/ http://win.postman.ru/ +replace public-win.tmp http://koi.postman.ru/ http://win.postman.ru/ + +towin < private-win.tmp > bookmark.ht0 +towin < public-win.tmp > bookmark.ht1 + +diff $HOME/.netscape/bookmarks.html bookmarks.html > diff.out + +if [ "$1" = "-c" ]; then + echo -n "Copying to home: " + cp -p bookmarks.html $HOME/.netscape/bookmarks.html && echo "Ok" +fi diff --git a/bkmk-rsync b/bkmk-rsync new file mode 100755 index 0000000..b819382 --- /dev/null +++ b/bkmk-rsync @@ -0,0 +1,9 @@ +#! 
/bin/sh + +remote_server=sun + +rsync -avz "$HOME/.netscape/bookmarks.html" "$remote_server:.netscape" # && \ + +#rsync -avz "$HOME/Internet/WWW/phd.pp.ru/Bookmarks/bkmk-koi.html" \ +# "$HOME/Internet/WWW/phd.pp.ru/Bookmarks/bkmk-win.html" \ +# "$remote_server:Internet/WWW/phd.pp.ru/Bookmarks" diff --git a/bkmk-set b/bkmk-set new file mode 100755 index 0000000..6dab5ba --- /dev/null +++ b/bkmk-set @@ -0,0 +1,31 @@ +#! /bin/sh + +umask 077 + +cd $HOME/work && \ +bkmk-koi -c || exit 1 + +mv public-koi.tmp bkmk-koi.html && \ +mv public-win.tmp bkmk-win.html && \ +chmod a+r bkmk-koi.html bkmk-win.html bookmark.ht1 || exit 1 + +echo "Splitting bookmarks..." +cd $HOME/Internet/WWW/phd.pp.ru/Bookmarks && \ +rm -rf split.koi split.win || exit 1 + +$HOME/lib/bookmarks_db/hotexplode.pl -o split.koi $HOME/work/bookmarks.html && \ +cp -p ../../nopasswd-index.html split.koi/privatelinks/index.html && \ +$HOME/lib/bookmarks_db/hotexplode.pl -o split.win $HOME/work/private-win.tmp && \ +cp -p ../../nopasswd-index.html split.win/privatelinks/index.html || exit 1 + +cp -p $HOME/work/bkmk-koi.html $HOME/work/bkmk-win.html . && \ +chmod -R a+rX . || exit 1 + +#/usr/local/htdig/bkmk.sh + +cd $HOME/work && \ +exec rm -f header bookmarks.err bookmarks_db.pickle bookmarks_db.flad private-win.tmp + +#zip -9 bkmk bkmk-koi.html bkmk-win.html && \ +#rm -f bkmk-win.html && \ +#cp -p bookmark.ht1 bkmk-win.html || exit 1 diff --git a/bkmk-sort b/bkmk-sort new file mode 100755 index 0000000..e24f8a9 --- /dev/null +++ b/bkmk-sort @@ -0,0 +1,11 @@ +#! /bin/sh + +PATH=$HOME/lib/bookmarks_db:$PATH + +bkmk2db || exit 1 + +BKMK_WRITER=flad sort_db.py -a && \ +BKMK_WRITER=flad sort_db.py -v && \ +BKMK_WRITER=flad sort_db.py -m && \ +BKMK_WRITER=flad sort_db.py -z && \ +BKMK_WRITER=flad sort_db.py -t || exit 1 diff --git a/bkmk-win b/bkmk-win new file mode 100755 index 0000000..c737d3d --- /dev/null +++ b/bkmk-win @@ -0,0 +1,41 @@ +#! /bin/sh + +PATH=$HOME/lib/bookmarks_db:$PATH + +if [ ! 
-f ./bookmark.htm ]; then + echo Cannot find bookmark.htm, aborted + exit 1 +fi + + +fromwin < bookmark.htm > bookmarks.html || exit 1 + +ns-unescape || exit 1 +bkmk2db || exit 1 +#check_db.py -l bookmarks.err || exit 1 + +#cp $HOME/Internet/WWW/header header +db2bkmk.py -o private-win.tmp || exit 1 +db2bkmk.py -p "Private links" -o public-win.tmp || exit 1 + +db2bkmk.py -t $HOME/lib/bookmarks_db/koi2win.db -r || exit 1 +db2bkmk.py -t $HOME/lib/bookmarks_db/koi2win.db -r -p "Private links" -o public-koi.tmp || exit 1 + +towin < bookmarks.html > bookmark.ht0 +towin < public-win.tmp > bookmark.ht1 + + +replace() { + sed "s^$2^$3^" $1 > _tmp.$$ + mv _tmp.$$ $1 +} + +replace bookmarks.html http://win.aha.ru/ http://koi.aha.ru/ +replace public-koi.tmp http://win.aha.ru/ http://koi.aha.ru/ +replace bookmarks.html http://win.postman.ru/ http://koi.postman.ru/ +replace public-koi.tmp http://win.postman.ru/ http://koi.postman.ru/ + +if [ "$1" = "-c" ]; then + echo -n "Copying to home: " + cp -p bookmarks.html $HOME/.netscape/bookmarks.html && echo "Ok" +fi diff --git a/bkmk2db b/bkmk2db new file mode 100755 index 0000000..1874e28 --- /dev/null +++ b/bkmk2db @@ -0,0 +1,23 @@ +#! /bin/sh + +PATH=$HOME/lib/bookmarks_db:$PATH + +if [ ! -f ./bookmarks_db.pickle -a ! -f ./bookmarks_db.flad ]; then + if [ -f ./bookmarks.html ]; then + BKMK= + elif [ -f $HOME/.netscape/bookmarks.html ]; then + BKMK=$HOME/.netscape/bookmarks.html + else + echo Cannot find bookmarks.html, aborted + exit 1 + fi + + bkmk2db.py $BKMK || exit 1 + +elif [ -f ./bookmarks_db.pickle -a -f ./bookmarks.html -a ./bookmarks.html -nt ./bookmarks_db.pickle ]; then + bkmk2db.py || exit 1 + +elif [ -f ./bookmarks_db.flad -a -f ./bookmarks.html -a ./bookmarks.html -nt ./bookmarks_db.flad ]; then + bkmk2db.py || exit 1 + +fi diff --git a/bkmk2db.py b/bkmk2db.py index 39ddea6..e2436f6 100755 --- a/bkmk2db.py +++ b/bkmk2db.py @@ -1,32 +1,25 @@ #! 
/usr/local/bin/python -O """ - Convert Netscape Navigator's bookmarks.html to FLAD database + Convert Netscape Navigator's or Mozilla's bookmarks.html to a database - Written by BroytMann, Jun 1997 - Mar 1999. Copyright (C) 1997-1999 PhiloSoft Design + Written by BroytMann, Jun 1997 - Aug 2002. Copyright (C) 1997-2002 PhiloSoft Design """ -import sys, os, stat, string -from getopt import getopt -import bkmk_parser -from formatter import AbstractFormatter, NullWriter +import sys, os +from getopt import getopt +from bkmk_parser import BkmkParser def run(): - optlist, args = getopt(sys.argv[1:], "gits") + optlist, args = getopt(sys.argv[1:], "is") - show_pbar = 1 - to_text = 0 - to_gf = 0 + show_pbar = not __debug__ report_stats = 1 for _opt, _arg in optlist: - if _opt == '-g': - to_gf = 1 if _opt == '-i': show_pbar = 0 - if _opt == '-t': - to_text = 1 if _opt == '-s': report_stats = 0 try: @@ -37,6 +30,7 @@ def run(): if args: if len(args) > 1: sys.stderr.write("bkmk2db: too many arguments\n") + sys.stderr.write("Usage: bkmk2db [-is] bookmarks.html\n") sys.exit(1) filename = args[0] @@ -44,44 +38,29 @@ def run(): else: filename = 'bookmarks.html' # good name both for DOS (bookmark.htm) and UNIX + + if report_stats: + from storage import storage_name + sys.stdout.write("Converting %s to %s: " % (filename, storage_name)) + sys.stdout.flush() + if show_pbar: show_pbar = sys.stderr.isatty() if show_pbar: try: - from tty_pbar import ttyProgressBar + from m_lib.pbar.tty_pbar import ttyProgressBar except ImportError: show_pbar = 0 if show_pbar: try: - size = os.stat(filename)[stat.ST_SIZE] + size = os.stat(filename).st_size except: print filename, ": no such file" sys.exit(1) - fmt = AbstractFormatter(NullWriter()) - if to_text: - parser = bkmk_parser.Bookmarks2Text(fmt) - elif to_gf: - parser = bkmk_parser.Bookmarks2Gadfly(fmt) - else: - parser = bkmk_parser.Bookmarks2Flad(fmt) - - - if report_stats: - str = "Converting " + filename + " to " - if to_text: - str = "text" - 
elif to_gf: - str = "GadFly database" - else: - str = "FLAD database" - - sys.stdout.write("Converting %s to %s: " % (filename, str)) - sys.stdout.flush() - if show_pbar: pbar = ttyProgressBar(0, size) lng = 0 @@ -92,39 +71,34 @@ def run(): else: dos_add = 0 # UNIX' and Mac's len() counts CR or LF correct - try: - f = open(filename, 'r') - except IOError, msg: - print filename, ":", msg - sys.exit(1) - header = open("header", 'w') - line_no = 0 + infile = open(filename, 'r') + parser = BkmkParser() - while 1: - line = f.readline() - if not line: - break + line_no = 0 + lng = 0 + ok = 1 + for line in infile: if show_pbar: lng = lng + len(line) + dos_add pbar.display(lng) - line = string.strip(line) + line = line.strip() line_no = line_no + 1 try: parser.feed(line) + except: + ok = 0 + break - if parser.outfile: # Write header until HTML parser start writing outfile - if header: - header.close() - header = None - else: - header.write(line + '\n') + try: + parser.close() + except: + ok = 0 - except: - break # I need total number of lines; interpreter will print traceback on exit + infile.close() if show_pbar: del pbar @@ -132,11 +106,18 @@ def run(): if report_stats: print "Ok" print line_no, "lines proceed" - print parser.urls_no, "urls found" - print parser.record_no, "records created" + print parser.urls, "urls found" + print parser.objects, "objects created" - parser.close() - f.close() + if ok: + from storage import storage + storage = storage() + storage.store(parser.root_folder) + + else: + import traceback + traceback.print_exc() + sys.exit(1) if __name__ == '__main__': diff --git a/bkmk_objects.py b/bkmk_objects.py new file mode 100644 index 0000000..2cb4600 --- /dev/null +++ b/bkmk_objects.py @@ -0,0 +1,159 @@ +""" + Objects to represent bookmarks.html structure + + Written by BroytMann, Mar 2000 - Jul 2002. 
Copyright (C) 2000-2002 PhiloSoft Design +""" + + +from UserList import UserList + +class Folder(UserList): + isFolder = 1 + isBookmark = 0 + + def __init__(self, add_date = None, comment = ''): + UserList.__init__(self) + self.comment = comment + self.add_date = add_date + + + def walk_depth(self, walker, level=0): + if hasattr(self, "header"): # root folder + prune = 0 + walker.root_folder(self) + else: + prune = walker.prune_folder(self) + if not prune: + walker.start_folder(self, level) + + if not prune: + for object in self.data: + if object.isFolder: + object.walk_depth(walker, level+1) + elif object.isBookmark: + walker.bookmark(object, level) + else: + walker.ruler(object, level) + + walker.end_folder(self, level) + + +class Bookmark: + isFolder = 0 + isBookmark = 1 + + def __init__(self, href, add_date, last_visit, last_modified, comment = ''): + self.comment = comment + self.href = href + self.add_date = add_date + self.last_visit = last_visit + self.last_modified = last_modified + + +class Ruler: + isFolder = 0 + isBookmark = 0 + + +class Walker: + """ + Interface class. Any instance that will be passed to Folder.walk_depth + may be derived from this class. It is not mandatory - unlike Java + Python does not require interface classes; but it is convenient to have + some methods predefined to no-op, in case you do not want to + provide end_folder etc. 
+ """ + + def root_folder(self, r): + pass + + def start_folder(self, f, level): + pass + + def end_folder(self, f, level): + pass + + def bookmark(self, b, level): + pass + + def ruler(self, r, level): + pass + + def prune_folder(self, folder): + return 0 + + +class Writer(Walker): + def __init__(self, outfile, prune=None): + self.outfile = outfile + self.prune = prune + + def prune_folder(self, folder): + return self.prune == folder.name + + +class Robot: + def __init__(self, tempfname, log): + self.tempfname = tempfname + self.log = log + + def stop(self): + pass # Nothing to do on cleanup + + +# Helper class to make inverese links (nodes linked to their parent) +class InverseLinker(Walker): + def root_folder(self, r): + self.parent_stack = [r] + + def start_folder(self, f, level): + f.parent = self.parent_stack[-1] + self.parent_stack.append(f) # Push the folder onto the stack of parents + + def end_folder(self, f, level): + del self.parent_stack[-1] # Pop off the stack + + def bookmark(self, b, level): + b.parent = self.parent_stack[-1] + + def ruler(self, r, level): + r.parent = self.parent_stack[-1] + + +# Helper class to make linear represenatation of the tree +class Linear(Walker): + def root_folder(self, r): + r.linear = [r] + self.linear = r.linear + + def add_object(self, object): + self.linear.append(object) + + def start_folder(self, f, level): + self.add_object(f) + + def bookmark(self, b, level): + self.add_object(b) + + def ruler(self, r, level): + self.add_object(r) + + +# Helper - make linked linear represenatation of the tree, suitable to be stored in sequential storage +def make_linear(root_folder): + linker = InverseLinker() + root_folder.walk_depth(linker) + + linear = Linear() + root_folder.walk_depth(linear) + + +# Helper, opposite of make_linear - make a tree from the linked linear representation +def make_tree(linear): + root_folder = linear[0] + del linear[0] + + for object in linear: + object.parent.append(object) + + return root_folder 
diff --git a/bkmk_parser.py b/bkmk_parser.py index c3ca1b5..f396e2e 100755 --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -1,125 +1,121 @@ """ - Bookmarks parsers + Parser for Netscape Navigator's bookmarks.html - Written by BroytMann, Mar 1997 - Feb 2000. Copyright (C) 1997-2000 PhiloSoft Design + Written by BroytMann, Jun 1997 - Jun 2002. Copyright (C) 1997-2002 PhiloSoft Design """ -import os, string, shutil -from htmllib import HTMLParser +import string +from m_lib.www.html import HTMLParser +from bkmk_objects import Folder, Bookmark, Ruler -class BookmarksParser(HTMLParser): # Parser for Navigator's bookmarks (abstract class) - def __init__(self, formatter, verbose=0): - HTMLParser.__init__(self, formatter, verbose) - self.urls_no = 0 # cross-reference counter - self.record_no = 1 # record counter - self.outfile = None # output file - self.level = 0 # Indentation level - self.flag_out = 0 # Is it time to flush? - self.saved_data = '' - self.saved_anchor = None - self.saved_folder = None - self.saved_ruler = None +if __debug__: + def debug(note): + print note + def dump_names(folder_stack): + l = [] + for object in folder_stack: + if object.isFolder: + l.append(object.name) + return "'" + string.join(l, "' '") + "'" - def flush(self): - if not self.outfile: - return - - record_flushed = 0 +else: + def debug(note): + pass + dump_names = debug - if self.saved_anchor: - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + self.saved_data) - self.flush_anchor() - self.saved_data = '' - record_flushed = 1 - self.saved_anchor = None - if self.saved_folder: - name, add_date, comment = self.saved_folder - self.saved_folder = (name, add_date, comment + self.saved_data) - self.flush_folder() - self.saved_data = '' - record_flushed = 1 - self.saved_folder = None +class BkmkParser(HTMLParser): + def __init__(self): + HTMLParser.__init__(self) - if self.saved_ruler: - 
self.flush_ruler() - record_flushed = 1 - self.saved_ruler = None + self.urls = 0 + self.objects = 0 - if record_flushed: - self.record_no = self.record_no + 1 - - if self.saved_data <> '': # This may occur after ampersand - self.flag_out = 0 + self.charset = "" + self.recode = None + def handle_data(self, data): + if data: + if self.charset: + data = unicode(data, self.charset).encode() + self.accumulator = "%s%s" % (self.accumulator, data) - def close(self): - HTMLParser.close(self) + # Mozilla - get charset + def do_meta(self, attrs): + http_equiv = "" + content = "" - if self.outfile: - self.outfile.close() - - if self.level <> 0: - print "Bad HTML:
and
mismatch; level=%d" % self.level + for attrname, value in attrs: + value = string.strip(value) + if attrname == 'http-equiv': + http_equiv = value.lower() + elif attrname == 'content': + content = value + if http_equiv == "content-type": + try: + # extract charset from "text/html; charset=UTF-8" + self.charset = content.split('=')[1] + except IndexError: + pass - def handle_data(self, data): - if not self.outfile: - return - if data and (data[0] == '&'): # Ampersand parsed by SGMLlib - self.flag_out = 0 + def start_title(self, attrs): + self.accumulator = "%s" % self.accumulator - if self.flag_out == 2: # Process comment after <DD> or <HR> - if self.saved_anchor: - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + data) - data = '' # Used + def end_title(self): + self.accumulator = "%s" % self.accumulator - if self.saved_folder: - name, add_date, comment = self.saved_folder - self.saved_folder = (name, add_date, comment + data) - data = '' # Used - self.flag_out = 0 + # Start root folder + def start_h1(self, attrs): + root_folder = Folder() + self.current_object = root_folder + self.root_folder = root_folder + self.current_folder = root_folder + self.folder_stack = [root_folder] - if self.flag_out == 1: - self.flush() + self.root_folder.header = self.accumulator + self.accumulator = '' - if data and (data[0] <> '&') and (self.flag_out == 0): - self.flag_out = 1 # Set flag (to flush data on next call) + def end_h1(self): + accumulator = self.accumulator + self.accumulator = '' - if data: - self.saved_data = self.saved_data + data + debug("Root folder name: `%s'" % accumulator) + self.root_folder.name = accumulator - def anchor_bgn(self, href, add_date, last_visit, last_modified): - self.flush() - self.anchor = (href, add_date, last_visit, last_modified) + # Start next folder + def start_h3(self, attrs): + for attrname, value in attrs: + value = 
string.strip(value) + if attrname == 'add_date': + add_date = value + debug("New folder...") + folder = Folder(add_date) + self.current_object = folder + self.current_folder.append(folder) + self.folder_stack.append(folder) # push new folder + self.current_folder = folder + self.objects = self.objects + 1 - def anchor_end(self): - if self.anchor: - href, add_date, last_visit, last_modified = self.anchor - self.anchor = None - self.urls_no = self.urls_no + 1 + def end_h3(self): + accumulator = self.accumulator + self.accumulator = '' - self.saved_anchor = (self.saved_data, href, add_date, last_visit, last_modified, '') - self.saved_data = '' # Used + debug("Folder name: `%s'" % accumulator) + self.current_folder.name = accumulator + # Start bookmark def start_a(self, attrs): - href = '' - add_date = '' - last_visit = '' - last_modified = '' - for attrname, value in attrs: value = string.strip(value) if attrname == 'href': @@ -131,191 +127,86 @@ class BookmarksParser(HTMLParser): # Parser for Navigator's bookmarks (abstract if attrname == 'last_modified': last_modified = value - self.anchor_bgn(href, add_date, last_visit, last_modified) + debug("Bookmark points to: `%s'" % href) + bookmark = Bookmark(href, add_date, last_visit, last_modified) + self.current_object = bookmark + self.current_folder.append(bookmark) + self.urls = self.urls + 1 + self.objects = self.objects + 1 + def end_a(self): + accumulator = self.accumulator + self.accumulator = '' - def start_h3(self, attrs): # Navigator marks folders with

tags - self.flush() - add_date = '' + debug("Bookmark name: `%s'" % accumulator) + bookmark = self.current_folder[-1] + bookmark.name = accumulator - for attrname, value in attrs: - value = string.strip(value) - if attrname == 'add_date': - add_date = value - self.saved_folder = ('', add_date, '') - self.flag_out = 0 + def flush(self): + accumulator = self.accumulator + if accumulator: + self.accumulator = '' - def end_h3(self): # End of folder - name, add_date, comment = self.saved_folder - self.saved_folder = (name + self.saved_data, add_date, comment) - self.saved_data = '' # Used + current_object = self.current_object + current_object.comment = current_object.comment + accumulator + debug("Comment: `%s'" % current_object.comment) def start_dl(self, attrs): self.flush() - if not self.outfile: # We are starting output after 1st
tag to skip header - self.open_outfile() - - self.level = self.level + 1 + do_dt = start_dl + # End of folder def end_dl(self): self.flush() - self.level = self.level - 1 - - - def do_dd(self, attrs): - if self.outfile: - self.flag_out = 2 # Set flag to signal "comment starting" - - - def do_br(self, attrs): - if self.outfile: - self.saved_data = self.saved_data + "
" # Add
... - self.flag_out = 0 # ...and next line of comment to saved comment - - - def do_hr(self, attrs): - if self.outfile: - self.flush() - self.saved_ruler = 1 - - - def handle_charref(self, name): - if self.outfile: - self.flag_out = 0 - self.saved_data = "%s&%c" % (self.saved_data, chr(name)) - - - def handle_entityref(self, name): - if self.outfile: - self.flag_out = 0 - if self.entitydefs.has_key(name): # If it is one of the standard SGML entities - close it with semicolon - x = ';' + debug("End folder") + debug("Folder stack: %s" % dump_names(self.folder_stack)) + if self.folder_stack: + del self.folder_stack[-1] # pop last folder + if self.folder_stack: + self.current_folder = self.folder_stack[-1] else: - x = '' - self.saved_data = "%s&%s%s" % (self.saved_data, name, x) - - - def open_outfile(self): - self.outfile = open("bookmarks.tmp", 'w') + debug("FOLDER STACK is EMPTY!!! (1)") + else: + debug("FOLDER STACK is EMPTY!!! (2)") + self.current_object = None -class Bookmarks2Text(BookmarksParser): - def flush_anchor(self): - self.outfile.write(" "*(self.level-1) + str(self.saved_anchor) + '\n') - + def close(self): + HTMLParser.close(self) + if self.folder_stack: + raise ValueError, "wrong folder stack: %s" % self.folder_stack - def flush_folder(self): - self.outfile.write(" "*(self.level-1) + str(self.saved_folder) + '\n') + def do_dd(self, attrs): + pass - def flush_ruler(self): - self.outfile.write(" "*(self.level-1) + "----------\n") + do_p = do_dd - def __del__(self): - shutil.copy("bookmarks.tmp", "bookmarks.txt") - os.unlink("bookmarks.tmp") + # Start ruler + def do_hr(self, attrs): + self.flush() + debug("Ruler") + self.current_folder.append(Ruler()) + self.current_object = None + self.objects = self.objects + 1 -class Bookmarks2Flad(BookmarksParser): - def __init__(self, formatter, verbose=0): - BookmarksParser.__init__(self, formatter, verbose) - self.flush_record = 0 + # BR in comment + def do_br(self, attrs): + self.accumulator = "%s
" % self.accumulator - def flush(self): - if not self.outfile: - return + # Allow < in the text + def unknown_starttag(self, tag, attrs): + self.accumulator = "%s<%s>" % (self.accumulator, tag) - record_flushed = 0 - if self.saved_anchor or self.saved_folder or self.saved_ruler or self.saved_data: - if self.flush_record: - self.outfile.write('\n') - else: - self.flush_record = 1 - - BookmarksParser.flush(self) - - - def flush_anchor(self): - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.outfile.write("""Level: %d -Title: %s -URL: %s -AddDate: %s -LastVisit: %s -LastModified: %s -Comment: %s -""" % (self.level, name, href, add_date, last_visit, last_modified, comment)) - - def flush_folder(self): - name, add_date, comment = self.saved_folder - self.outfile.write("""Level: %d -Folder: %s -AddDate: %s -Comment: %s -""" % (self.level, name, add_date, comment)) - - def flush_ruler(self): - self.outfile.write("Level: %s\nRuler: YES\n" % self.level) - - - def __del__(self): - shutil.copy("bookmarks.tmp", "bookmarks.db") - os.unlink("bookmarks.tmp") - - -class Bookmarks2Gadfly(BookmarksParser): - def open_outfile(self): - import gadfly - connection = gadfly.gadfly() - connection.startup("bookmarks", ".") - self.connection = connection - - cursor = connection.cursor() - cursor.execute("""create table bookmarks ( - rec_no integer, - level integer, - title varchar, - DATA varchar, - add_date integer, - last_visit integer, - last_modified integer, - comment varchar - )""") - self.outfile = cursor - - self.template = """insert into bookmarks - (rec_no, level, title, DATA, add_date, last_visit, last_modified, comment) - values (?, ?, ?, ?, ?, ?, ?, ?)""" - - - def __del__(self): - self.connection.commit() - - - def flush_anchor(self): - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.outfile.execute(self.template, - (self.record_no, self.level, name, href, - add_date, last_visit, last_modified, comment) - 
) - - def flush_folder(self): - name, add_date, comment = self.saved_folder - self.outfile.execute(self.template, - (self.record_no, self.level, name, "Folder", - add_date, '', '', comment) - ) - - def flush_ruler(self): - self.outfile.execute(self.template, - (self.record_no, self.level, '', "Ruler", - '', '', '', '') - ) + # Do not allow unknow end tags + def unknown_endtag(self, tag): + raise NotImplementedError("Unknow end tag `%s'" % tag) diff --git a/check_db.py b/check_db.py deleted file mode 100755 index 32472e9..0000000 --- a/check_db.py +++ /dev/null @@ -1,186 +0,0 @@ -#! /usr/local/bin/python -O -""" - Test FLAD database for: duplicate URLs, too big indent, incorrect record - format, spare keys. - - Written by BroytMann, Jun 1997 - Feb 2000. Copyright (C) 1997-2000 PhiloSoft Design -""" - -import sys, string -from getopt import getopt -from copy import _copy_dict - -import fladm - - -def error(err_str): - global errors_found, report_stats - if errors_found == 0: - if report_stats: - print "errors found" - - errors_found = errors_found + 1 - sys.stderr.write("%s\n" % err_str) - - if logfile: - logfile.write("%s\n" % err_str) - - -def check_key(record_no, record, key, allow_empty=1): - if not record.has_key(key): - error("Expected `%s' in record %d -- %s" % (key, record_no, str(record))) - return - - if not allow_empty and not record[key]: - error("Empty key `%s' in record %d -- %s" % (key, record_no, str(record))) - - del record[key] - -def check_date(record_no, record, key): - if not record.has_key(key): - error("Expected `%s' in record %d -- %s" % (key, record_no, str(record))) - else: - try: - _date = string.atoi(record[key]) - except string.atoi_error: - error("Bad `%s' format in record %d -- %s" % (key, record_no, str(record))) - - del record[key] - -def check_empty(record_no, record): - if record <> {}: - error("Spare keys in record %d -- %s" % (record_no, str(record))) - -def check_url(record_no, record): - # I am not testing here 
check_url("Level") because it is impossible - # to come here without "Level" key - fladm.check_record has to reject - # entire database if there is record without this "must key". - # If someone adds record without "Level" manually - it is serious error - # and the following line raise exception. - del record["Level"] - - check_key(record_no, record, "Title") - check_key(record_no, record, "URL") - check_key(record_no, record, "Comment") - - check_date(record_no, record, "AddDate") - check_date(record_no, record, "LastVisit") - check_date(record_no, record, "LastModified") - - check_empty(record_no, record) - -def check_folder(record_no, record): - # Read comment above - in the beginning of check_url() - del record["Level"] - - check_key(record_no, record, "Folder") - check_key(record_no, record, "Comment") - - check_date(record_no, record, "AddDate") - check_empty(record_no, record) - -def check_ruler(record_no, record): - # Read comment above - in the beginning of check_url() - del record["Level"] - - if not record.has_key("Ruler"): - error("No `Ruler' in record %d -- %s" % (record_no, str(record))) - else: - if record["Ruler"] <> "YES": # Impossible: ruler saying it is not ruler - error("Ruler saying it is not ruler in record %d -- %s" % (record_no, str(record))) - del record["Ruler"] - - check_empty(record_no, record) - - -def run(): - optlist, args = getopt(sys.argv[1:], "l:s") - - global errors_found, report_stats, logfile - report_stats = 1 - - logfile = None - logfname = None - - for _opt, _arg in optlist: - if _opt == '-l': - logfname = _arg - if _opt == '-s': - report_stats = 0 - try: - del _opt, _arg - except NameError: - pass - - if len(args) > 1: - sys.stderr.write("check_db: too many arguments\n") - sys.exit(1) - - - if logfname: - logfile = open(logfname, 'w') - - if report_stats: - sys.stdout.write("Loading: ") - sys.stdout.flush() - - bookmarks_db = fladm.load_from_file("bookmarks.db", fladm.check_record, ["Level"]) - - if report_stats: - print 
"Ok" - sys.stdout.write("Testing: ") - sys.stdout.flush() - - record_no = 0 - save_level = 1 - got_folder = 1 # Start as if we already have one folder - errors_found = 0 - - URL_d = {} # Create hash table full of URLs - - for record in bookmarks_db: - record_no = record_no + 1 - level = string.atoi(record["Level"]) - - if record.has_key("URL"): - if URL_d.has_key(record["URL"]): - error("Duplicate URL (rec. %d, 1st at rec. %d): %s" % (record_no, URL_d[record["URL"]], str(record["URL"]))) - else: - URL_d[record["URL"]] = record_no - - check_url(record_no, _copy_dict(record)) - - elif record.has_key("Folder"): - check_folder(record_no, _copy_dict(record)) - - elif record.has_key("Ruler"): - check_ruler(record_no, _copy_dict(record)) - - else: - raise KeyError, "neither \"URL\" nor \"Folder\" nor \"Ruler\" in record " + str(record) - - if got_folder: - if (level > save_level + 1): - error("Indent %d too big (want %d at rec. %d), record: %s" % (level, save_level, record_no, str(record))) - else: - if (level > save_level): - error("Indent %d without folder (rec. %d), record: %s" % (level, record_no, str(record))) - - save_level = level - got_folder = record.has_key("Folder") # Test here to save got_folder for next loop - - # End of loop - - if logfname: - logfile.close() - - if report_stats: - print record_no, "records tested" - if errors_found == 0: - print "Ok (no errors found)" - else: - print "%d errors found" % errors_found - - -if __name__ == '__main__': - run() diff --git a/check_dups.py b/check_dups.py new file mode 100755 index 0000000..1a947dd --- /dev/null +++ b/check_dups.py @@ -0,0 +1,90 @@ +#! /usr/local/bin/python -O +""" + Check duplicate URLs in the bookmarks database + + Written by BroytMann, Jun 2000 - Aug 2002. Copyright (C) 2000-2002 PhiloSoft Design +""" + + +import sys + + +log_file = None + +def report_dup(href, object_no): + s = "Duplicate URL: %s (first at rec. 
%d)" % (href, object_no) + print s + + if log_file: + log_file.write("%s\n" % s) + + + +def run(): + from getopt import getopt + optlist, args = getopt(sys.argv[1:], "sl:") + + report_stats = 1 + global log_file + log_filename = None + + for _opt, _arg in optlist: + if _opt == '-s': + report_stats = 0 + if _opt == '-l': + log_filename = _arg + try: + del _opt, _arg + except NameError: + pass + + if report_stats: + print "BroytMann check_dups, Copyright (C) 2000 PhiloSoft Design" + + if args: + sys.stderr.write("check_urls: too many arguments\n") + sys.stderr.write("Usage: check_urls [-s] [-l logfile]\n") + sys.exit(1) + + if log_filename: + log_file = open(log_filename, 'w') + + from storage import storage + storage = storage() + + if report_stats: + sys.stdout.write("Loading %s: " % storage.filename) + sys.stdout.flush() + + root_folder = storage.load() + from bkmk_objects import make_linear + make_linear(root_folder) + objects = len(root_folder.linear) + + if report_stats: + print "Ok" + + + dup_dict = {} + + for object_no in range(objects): + object = root_folder.linear[object_no] + + if object.isBookmark: + href = object.href + if dup_dict.has_key(href): + report_dup(href, dup_dict[href]) + else: + dup_dict[href] = object_no + + + if log_filename: + log_file.close() + + if report_stats: + print "Ok" + print objects, "objects passed" + + +if __name__ == '__main__': + run() diff --git a/check_new.py b/check_new.py deleted file mode 100755 index 5cd0a64..0000000 --- a/check_new.py +++ /dev/null @@ -1,27 +0,0 @@ -#! /usr/local/bin/python -O -""" - Test FLAD database for old records - - Written by BroytMann, Feb 2000. 
Copyright (C) 2000 PhiloSoft Design -""" - - -import fladm -from time import time - -now = time() -thrashold = 2*24*3600 # 2 days - - -def run(): - bookmarks_db = fladm.load_from_file("bookmarks.db", fladm.check_record, ["Level"]) - - for record in bookmarks_db: - if record.has_key("URL"): - add_date = int(record["AddDate"]) - if now - add_date < thrashold: - print "New URL:", record["URL"] - - -if __name__ == '__main__': - run() diff --git a/check_old.py b/check_old.py deleted file mode 100755 index 15a0990..0000000 --- a/check_old.py +++ /dev/null @@ -1,27 +0,0 @@ -#! /usr/local/bin/python -O -""" - Test FLAD database for old records - - Written by BroytMann, Feb 2000. Copyright (C) 2000 PhiloSoft Design -""" - - -import fladm -from time import time - -now = time() -thrashold = 2*30*24*3600 # 2 months - - -def run(): - bookmarks_db = fladm.load_from_file("bookmarks.db", fladm.check_record, ["Level"]) - - for record in bookmarks_db: - if record.has_key("URL"): - last_visit = int(record["LastVisit"]) - if now - last_visit > thrashold: - print "Old URL:", record["URL"] - - -if __name__ == '__main__': - run() diff --git a/check_title.py b/check_title.py new file mode 100755 index 0000000..950c6e2 --- /dev/null +++ b/check_title.py @@ -0,0 +1,75 @@ +#! /usr/local/bin/python -O +""" + Check and show URLs in the bookmarks database where name <> real title + + Written by BroytMann, Jul 2002 - Aug 2002. 
Copyright (C) 2002 PhiloSoft Design +""" + + +import sys + + +def run(): + from getopt import getopt + optlist, args = getopt(sys.argv[1:], "s") + + report_stats = 1 + + for _opt, _arg in optlist: + if _opt == '-s': + report_stats = 0 + try: + del _opt, _arg + except NameError: + pass + + if report_stats: + print "BroytMann check_title, Copyright (C) 2002 PhiloSoft Design" + + if args: + sys.stderr.write("check_title: too many arguments\n") + sys.stderr.write("Usage: check_title [-s]\n") + sys.exit(1) + + from storage import storage + storage = storage() + + if report_stats: + sys.stdout.write("Loading %s: " % storage.filename) + sys.stdout.flush() + + root_folder = storage.load() + from bkmk_objects import make_linear + make_linear(root_folder) + objects = len(root_folder.linear) + + if report_stats: + print "Ok" + + + for object_no in range(objects): + object = root_folder.linear[object_no] + + if object.isBookmark: + if hasattr(object, "moved") or hasattr(object, "error"): + continue + + if hasattr(object, "real_title"): + if object.name <> object.real_title: + print object.href + print object.name + print object.real_title + print + else: + print object.href + print object.name + print "NO REAL TITLE!!!" + print + + + if report_stats: + print objects, "objects passed" + + +if __name__ == '__main__': + run() diff --git a/check_url_sub.py b/check_url_sub.py deleted file mode 100755 index 6c301df..0000000 --- a/check_url_sub.py +++ /dev/null @@ -1,145 +0,0 @@ -#! /usr/local/bin/python -O -""" - Check URL - subprocess - - Written by BroytMann, Mar 1999 - Feb 2000. 
Copyright (C) 1999-2000 PhiloSoft Design -""" - - -import sys, os, stat, string, time -import urllib, www_util - -import cPickle -pickle = cPickle -from subproc import RecordFile - -from md5wrapper import md5wrapper - - -ftpcache_key = None -def myftpwrapper(user, passwd, host, port, dirs): - global ftpcache_key - ftpcache_key = (user, host, port, string.joinfields(dirs, '/')) - return _ftpwrapper(user, passwd, host, port, dirs) - -_ftpwrapper = urllib.ftpwrapper -urllib.ftpwrapper = myftpwrapper - -def get_welcome(): - global ftpcache_key - _welcome = urllib._urlopener.ftpcache[ftpcache_key].ftp.welcome - ftpcache_key = None # I am assuming there are no duplicate ftp URLs in db. - # If there are - ftpcache_key in prev line is invalid. - return _welcome - - -class RedirectException(Exception): - reloc_dict = { - 301: "perm", - 302: "temp" - } - def __init__(self, errcode, newurl): - Exception.__init__(self, "(%s.) to %s" % (self.reloc_dict[errcode], newurl)) - - -class MyURLopener(urllib.URLopener): - # Error 302 -- relocated (temporarily) - def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): - if headers.has_key('location'): - newurl = headers['location'] - elif headers.has_key('uri'): - newurl = headers['uri'] - else: - newurl = "Nowhere" - raise RedirectException(errcode, newurl) - - # Error 301 -- also relocated (permanently) - http_error_301 = http_error_302 - - # Error 401 -- authentication required - def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): - raise IOError, ('http error', errcode, "Authentication required ", headers) - - -def get_error(msg): - if type(msg) == type(""): - return msg - - else: - s = [] - for i in msg: - s.append("'%s'" % string.join(string.split(str(i), "\n"), "\\n")) - return "(%s)" % string.join(s) - -def check_url(record): - try: - now = str(int(time.time())) - url_type, url_rest = urllib.splittype(record["URL"]) - url_host, url_path = urllib.splithost(url_rest) - url_path, url_tag = 
urllib.splittag(url_path) - - tempfname = record["TEMPFILE"] - del record["TEMPFILE"] - - fname, headers = urllib.urlretrieve("%s://%s%s" % (url_type, url_host, url_path), tempfname) - - last_modified = None - record["Size"] = str(os.stat(tempfname)[stat.ST_SIZE]) - - if headers: - try: - last_modified = headers["Last-Modified"] - except KeyError: - last_modified = None - - if last_modified: - last_modified = www_util.parse_time(last_modified) - - if last_modified: - last_modified = str(int(last_modified)) - else: - last_modified = record["LastVisit"] - - record["LastModified"] = last_modified - - md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 - md5.update(get_welcome()) - - md5.md5file(tempfname) - record["MD5"] = str(md5) - - except IOError, msg: - if (msg[0] == "http error") and (msg[1] == -1): - record["NoError"] = "The server did not return any header - it is not an error, actually" - else: - record["Error"] = get_error(msg) - - except EOFError: - record["Error"] = "Unexpected EOF (FTP server closed connection)" - - except RedirectException, msg: - record["Moved"] = str(msg) - - # Mark this even in case of error - record["LastTested"] = now - - -def run(): - urllib._urlopener = MyURLopener() - - # Some sites allow only Mozilla-compatible browsers; way to stop robots? - server_version = "Mozilla/3.0 (compatible; Python-urllib/%s)" % urllib.__version__ - urllib._urlopener.addheaders[0] = ('User-agent', server_version) - - rec_in = RecordFile(sys.stdin) - rec_out = RecordFile(sys.stdout) - - while 1: - record = pickle.loads(rec_in.read_record()) - check_url(record) - rec_out.write_record(pickle.dumps(record)) - - -if __name__ == '__main__': - run() diff --git a/check_urls.py b/check_urls.py new file mode 100755 index 0000000..497c081 --- /dev/null +++ b/check_urls.py @@ -0,0 +1,159 @@ +#! /usr/local/bin/python -O +""" + Robot interface - check URLs from bookmarks database + + Written by BroytMann, Mar 2000 - Aug 2002. 
Copyright (C) 2000-2002 PhiloSoft Design +""" + + +import sys, os, urllib, tempfile +tempfname = "check_urls" + tempfile.gettempprefix() + "tmp" + + +def run(): + from getopt import getopt + optlist, args = getopt(sys.argv[1:], "ise") + + show_pbar = 1 + report_stats = 1 + only_errors = 0 + + for _opt, _arg in optlist: + if _opt == '-i': + show_pbar = 0 + if _opt == '-s': + report_stats = 0 + if _opt == '-e': + only_errors = 1 + try: + del _opt, _arg + except NameError: + pass + + if report_stats: + print "BroytMann check_urls, Copyright (C) 1997-2002 PhiloSoft Design" + + if args: + sys.stderr.write("check_urls: too many arguments\n") + sys.stderr.write("Usage: check_urls [-ise]\n") + sys.exit(1) + + if show_pbar: + show_pbar = sys.stderr.isatty() + + if show_pbar: + try: + from m_lib.pbar.tty_pbar import ttyProgressBar + except ImportError: + show_pbar = 0 + + from m_lib.flog import makelog, openlog + if only_errors: + log = openlog("check.log") + log("chk_urls restarted for errors") + if report_stats: + print "chk_urls restarted for errors" + else: + log = makelog("check.log") + log("check_urls started") + if report_stats: + print " check_urls: normal start" + + from storage import storage + storage = storage() + + from robots import robot + robot = robot(tempfname, log) + + if report_stats: + sys.stdout.write("Loading %s: " % storage.filename) + sys.stdout.flush() + + root_folder = storage.load() + from bkmk_objects import make_linear + make_linear(root_folder) + objects = len(root_folder.linear) + + if report_stats: + print "Ok" + + if report_stats: + if only_errors: + s = "Rechecking errors: " + else: + s = "Checking: " + sys.stdout.write(s) + sys.stdout.flush() + + if show_pbar: + pbar = ttyProgressBar(0, objects) + + urls_no = 0 + object_count = 0 + size = 0 + + checked = {} + rcode = 1 + + for object_no in range(objects): + if show_pbar: + pbar.display(object_no+1) + + object = root_folder.linear[object_no] + object_count = object_count + 1 + + if 
object.isBookmark: + if only_errors: + if hasattr(object, "error"): + delattr(object, "error") + else: + continue + + if checked.has_key(object.href): + log("Already checked %s" % object.href) + old_object = root_folder.linear[checked[object.href]] + for attr_name in ("last_visit", "last_modified", + "error", "no_error", "moved", "size", "md5", "real_title", + "last_tested", "test_time"): + if hasattr(old_object, attr_name): + setattr(object, attr_name, getattr(old_object, attr_name)) + else: + url_type, url_rest = urllib.splittype(object.href) + log("Checking %s:%s" % (url_type, url_rest)) + rcode = robot.check_url(object, url_type, url_rest) + + if rcode: + checked[object.href] = object_no + urls_no = urls_no + 1 + try: + size = size + int(object.size) + except (AttributeError, TypeError, ValueError): + pass # Some object does not have a size :( + else: + log("Interrupted by user (^C)") + break + robot.stop() + + if show_pbar: + del pbar + + if report_stats: + print "Ok" + print object_count, "objects passed" + print urls_no, "URLs checked" + print size, "bytes eaten" + + storage.store(root_folder) + + if rcode: + log("check_urls finished ok") + log.close() + + try: + os.unlink(tempfname) + except os.error: + pass + + +if __name__ == '__main__': + run() diff --git a/check_urls2.py b/check_urls2.py deleted file mode 100755 index 73a91f9..0000000 --- a/check_urls2.py +++ /dev/null @@ -1,310 +0,0 @@ -#! /usr/local/bin/python -O -""" - For every URL in the FLAD database get info from the Net - and store info in check.db - - Version 2.0 - Written by BroytMann, Aug 1997 - Mar 1999. 
Copyright (C) 1997-1999 PhiloSoft Design -""" - - -import sys, os, stat, string, time -from getopt import getopt - -import urllib, tempfile -from copy import _copy_dict - -import cPickle -pickle = cPickle - -import fladm, fladc, shutil -from flog import makelog, openlog - - -os.environ["PATH"] = ".:" + os.environ["PATH"] -from subproc import Subprocess, RecordFile - - -def set_checkpoint(rec_no): - cpfile = open("check.dat", 'w') - cpfile.write("# chk_urls checkpoint file\n") - cpfile.write("Size: %d\n" % db_stat[stat.ST_SIZE]) - cpfile.write("MTime: %d\n" % db_stat[stat.ST_MTIME]) - cpfile.write("Record: %d" % rec_no) - cpfile.close() - -def get_checkpoint(): - try: - cpfile = fladc.load_file("check.dat") - if (string.atoi(cpfile["Size"]) <> db_stat[stat.ST_SIZE]) or \ - (string.atoi(cpfile["MTime"]) <> db_stat[stat.ST_MTIME]): - return -3 - - return string.atoi(cpfile["Record"]) - - except IOError: # No such file - return -1 - - except KeyError: # No such key in checkpoint file - return -2 - - except string.atoi_error: # Wrong numeric format - return -2 - - return 0 - -def start(db_name, report_stats): - start_recno = get_checkpoint() - if start_recno < 0: - if start_recno == -1: - log = makelog("check.log") - log("chk_urls started") - if report_stats: - print " chk_urls: normal start" - - elif start_recno == -2: - log = openlog("check.log") - log("chk_urls started") - log(" invalid checkpoint file, checkpoint ignored") - if report_stats: - print " chk_urls: invalid checkpoint file, checkpoint ignored" - - elif start_recno == -3: - log = makelog("check.log") - log("chk_urls started") - log(" bookmarks.db changed, checkpoint ignored") - if report_stats: - print " chk_urls: bookmarks.db changed, checkpoint ignored" - - else: - raise RuntimeError, "wrong get_checkpoint() return: `%s'" % str(start_recno) - - start_recno = 0 - - elif start_recno == 0: - raise RuntimeError, "wrong get_checkpoint() return: `%s'" % str(start_recno) - - else: # start_recno > 0 - if 
os.path.exists("check.db"): - if not os.path.exists("check.old"): - shutil.copy("check.db", "check.old") - db_name = "check.db" - - log = openlog("check.log") - log("chk_urls started") - log(" found valid checkpoint file, continue") - if report_stats: - print " chk_urls: found valid checkpoint file, continue" - - else: - log = makelog("check.log") - log("chk_urls started") - log(" valid checkpoint, but no check.db file, restarting") - if report_stats: - print " chk_urls: valid checkpoint, but no check.db file, restarting" - start_recno = 0 - - return start_recno, db_name, log - - -tempfname = "check_urls" + tempfile.gettempprefix() + ".tmp" - - -check_subp = None -subp_pipe = None - -def restart_subp(log, report_stats): - global check_subp, subp_pipe - if check_subp: - log(" restarting hanging subprocess") - if report_stats: - print " chk_urls: restarting hanging subprocess" - del check_subp - del subp_pipe - - check_subp = Subprocess("check_url_sub.py") - subp_pipe = RecordFile(check_subp) - - -def check_url(record, log, report_stats): - try: - record["TEMPFILE"] = tempfname - subp_pipe.write_record(pickle.dumps(record)) - - if check_subp.waitForPendingChar(900): # wait 15 minutes - rec = pickle.loads(subp_pipe.read_record()) - del record["TEMPFILE"] - for key in rec.keys(): - record[key] = rec[key] - else: - restart_subp(log, report_stats) - del record["TEMPFILE"] - record["Error"] = "Subprocess connection timed out" - - except KeyboardInterrupt: - return 0 - - return 1 - - -def run(): - optlist, args = getopt(sys.argv[1:], "ise") - - show_pbar = 1 - report_stats = 1 - only_errors = 0 - db_name = "bookmarks.db" - - for _opt, _arg in optlist: - if _opt == '-i': - show_pbar = 0 - if _opt == '-s': - report_stats = 0 - if _opt == '-e': - only_errors = 1 - try: - del _opt, _arg - except NameError: - pass - - if report_stats: - print "BroytMann check_urls, Copyright (C) 1997-1999 PhiloSoft Design" - - if args: - if len(args) > 1: - sys.stderr.write("chk_urls: too many 
arguments\n") - sys.exit(1) - else: - db_name = args[0] - - if show_pbar: - show_pbar = sys.stderr.isatty() - - if show_pbar: - try: - from tty_pbar import ttyProgressBar - except ImportError: - show_pbar = 0 - - global db_stat, log - db_stat = os.stat(db_name) - - if only_errors: - start_recno = 0 - db_name = "check.db" - log = openlog("check.log") - log("chk_urls restarted for errors") - else: - start_recno, db_name, log = start(db_name, report_stats) - - if report_stats: - sys.stdout.write("Loading %s: " % db_name) - sys.stdout.flush() - - bookmarks_db = fladm.load_from_file(db_name, fladm.check_record, ["Level"]) - bookmarks_dbstore = bookmarks_db - - if only_errors: - bookmarks_db = filter(lambda r: r.has_key("Error") and r["Error"][:5] <> "Moved", bookmarks_db) - - if report_stats: - print "Ok" - - db_len = len(bookmarks_db) - if db_len == 0: - print "Database empty" - sys.exit(0) - - if start_recno >= db_len: - _s = "start_recno (%d) >= db_len (%d), restarting" % (start_recno, db_len) - log(" " + _s) - if report_stats: - print " chk_urls: " + _s - del _s - start_recno = 0 - - if report_stats: - if only_errors: - s = "Rechecking errors: " - else: - s = "Checking: " - sys.stdout.write(s) - sys.stdout.flush() - - if show_pbar: - save_stats = report_stats - report_stats = 0 - pbar = ttyProgressBar(0, db_len) - - urls_no = 0 - record_count = 0 - start_time = time.time() - - rcode = 1 - restart_subp(log, report_stats) # Not restart, just start afresh - checked_dict = {} # Dictionary of checked URLs, mapped to records number - - for record_no in range(start_recno, db_len): - if show_pbar: - pbar.display(record_no+1) - - record = bookmarks_db[record_no] - record_count = record_count + 1 - - if only_errors: - del record["Error"] - - if record.has_key("URL"): - url = record["URL"] - if checked_dict.has_key(url): - log("Already checked %s" % url) - level = record["Level"] - comment = record["Comment"] - bookmarks_db[record_no] = 
_copy_dict(bookmarks_db[checked_dict[url]]) - bookmarks_db[record_no]["Level"] = level - bookmarks_db[record_no]["Comment"] = comment - else: - log("Checking %s" % url) - rcode = check_url(record, log, report_stats) - if rcode: - current_time = time.time() - if current_time - start_time >= 300: # Save checkpoint and database every 5 min - bookmarks_dbstore.store_to_file("check.db") - set_checkpoint(record_no) - log.flush() - start_time = current_time - urls_no = urls_no + 1 - checked_dict[url] = record_no - else: - log("Interrupted by user (^C)") - break - - if show_pbar: - del pbar - report_stats = save_stats - - if report_stats: - print "Ok" - print record_count, "records checked" - print urls_no, "URLs checked" - - bookmarks_dbstore.store_to_file("check.db") - - if rcode: - log("chk_urls finished ok") - log.close() - - urllib.urlcleanup() - if os.path.exists(tempfname): - os.unlink(tempfname) - - if rcode: - if os.path.exists("check.dat"): - os.unlink("check.dat") - else: - set_checkpoint(record_no) - sys.exit(1) - - -if __name__ == '__main__': - run() diff --git a/chk_urls.py b/chk_urls.py deleted file mode 100755 index 8bc3ddd..0000000 --- a/chk_urls.py +++ /dev/null @@ -1,321 +0,0 @@ -#! /usr/local/bin/python -O -""" - For every URL in the FLAD database get info from the Net - and store info in check.db - - Written by BroytMann, Aug-Oct 1997. 
Copyright (C) 1997 PhiloSoft Design -""" - -import sys, os, string, stat, shutil, time -from getopt import getopt -import tempfile - -import urllib -from urllib import URLopener, splittype - -from md5wrapper import md5wrapper -from flog import makelog, openlog -import fladm, fladc, www_util - - -# Shortcut for basic usage -_urlopener = None - -def urlopen(url): - global _urlopener - if not _urlopener: - _urlopener = URLopener() - return _urlopener.open(url) - -def urlretrieve(url, filename=None): - global _urlopener - if not _urlopener: - _urlopener = URLopener() - if filename: - return _urlopener.retrieve(url, filename) - else: - return _urlopener.retrieve(url) - -def urlcleanup(): - if _urlopener: - _urlopener.cleanup() - - -_key = None - -def myftpwrapper(user, passwd, host, port, dirs): - global _key - _key = (user, host, port, string.joinfields(dirs, '/')) - return _ftpwrapper(user, passwd, host, port, dirs) - -_ftpwrapper = urllib.ftpwrapper -urllib.ftpwrapper = myftpwrapper - -def get_welcome(): - global _key - _welcome = _urlopener.ftpcache[_key].ftp.welcome - _key = None # I am assuming there are no duplicate ftp URLs in db. 
If there are - _key in prev line is invalid - return _welcome - - -def set_checkpoint(rec_no): - cpfile = open("check.dat", 'w') - cpfile.write("# chk_urls checkpoint file\n") - cpfile.write("Size: %d\n" % db_stat[stat.ST_SIZE]) - cpfile.write("MTime: %d\n" % db_stat[stat.ST_MTIME]) - cpfile.write("Record: %d" % rec_no) - cpfile.close() - -def get_checkpoint(): - try: - cpfile = fladc.load_file("check.dat") - if (string.atoi(cpfile["Size"]) <> db_stat[stat.ST_SIZE]) or \ - (string.atoi(cpfile["MTime"]) <> db_stat[stat.ST_MTIME]): - return -3 - - return string.atoi(cpfile["Record"]) - - except IOError: # No such file - return -1 - - except KeyError: # No such key in checkpoint file - return -2 - - except string.atoi_error: # Wrong numeric format - return -2 - - return 0 - - -tempfname = tempfile.gettempprefix() + "check.tmp" - - -def get_error(msg): - if type(msg) == type(""): - return msg - - else: - s = "" - for i in msg: - if s <> "": - s = s + ", " - x = string.join(string.split(str(i), "\n"), "\\n") - s = s + "'%s'" % x - return "(" + s + ")" - -def check_url(record, url_type, url_rest): - - now = str(int(time.time())) - - try: - fname, headers = urlretrieve(url_type + ':' + url_rest, tempfname) - - last_modified = None - - record["Size"] = str(os.stat(tempfname)[stat.ST_SIZE]) - - if headers: - try: - last_modified = headers["Last-Modified"] - except KeyError: - last_modified = None - - if last_modified: - last_modified = www_util.parse_time(last_modified) - - if last_modified: - last_modified = str(int(last_modified)) - else: - last_modified = record["LastVisit"] - - record["LastModified"] = last_modified - - md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 - md5.update(get_welcome()) - - md5.md5file(tempfname) - record["MD5"] = str(md5) - - except IOError, msg: - record["Error"] = get_error(msg) - - except EOFError: - record["Error"] = "Unexpected EOF (FTP server closed connection)" - - except KeyboardInterrupt: - return 0 - - # 
Mark this even in case of error - record["LastTested"] = now - - return 1 - - -def run(): - optlist, args = getopt(sys.argv[1:], "is") - - show_pbar = 1 - report_stats = 1 - db_name = "bookmarks.db" - - for _opt, _arg in optlist: - if _opt == '-i': - show_pbar = 0 - if _opt == '-s': - report_stats = 0 - try: - del _opt, _arg - except NameError: - pass - - if report_stats: - print "BroytMann chk_urls, Copyright (C) 1997-1998 PhiloSoft Design" - - if args: - sys.stderr.write("chk_urls: too many arguments\n") - sys.exit(1) - - if show_pbar: - show_pbar = sys.stderr.isatty() - - if show_pbar: - try: - from tty_pbar import ttyProgressBar - except ImportError: - show_pbar = 0 - - global db_stat, log - db_stat = os.stat("bookmarks.db") - - start_recno = get_checkpoint() - if start_recno < 0: - if start_recno == -1: - log = makelog("check.log") - log("chk_urls started") - if report_stats: - print " chk_urls: normal start" - - elif start_recno == -2: - log = openlog("check.log") - log("chk_urls started") - log(" invalid checkpoint file, checkpoint ignored") - if report_stats: - print " chk_urls: invalid checkpoint file, checkpoint ignored" - - elif start_recno == -3: - log = makelog("check.log") - log("chk_urls started") - log(" bookmarks.db changed, checkpoint ignored") - if report_stats: - print " chk_urls: bookmarks.db changed, checkpoint ignored" - - else: - raise RuntimeError, "wrong get_checkpoint() return: `%s'" % str(start_recno) - - start_recno = 0 - - elif start_recno == 0: - raise RuntimeError, "wrong get_checkpoint() return: `%s'" % str(start_recno) - - else: # start_recno > 0 - if os.path.exists("check.db"): - if not os.path.exists("check.old"): - shutil.copy("check.db", "check.old") - db_name = "check.db" - - log = openlog("check.log") - log("chk_urls started") - log(" found valid checkpoint file, continue") - if report_stats: - print " chk_urls: found valid checkpoint file, continue" - - else: - log = makelog("check.log") - log("chk_urls started") - log(" 
valid checkpoint, but no check.db file, restarting") - if report_stats: - print " chk_urls: valid checkpoint, but no check.db file, restarting" - start_recno = 0 - - if report_stats: - sys.stdout.write("Loading %s: " % db_name) - sys.stdout.flush() - - bookmarks_db = fladm.load_from_file(db_name, fladm.check_record, ["Level"]) - db_len = len(bookmarks_db) - - if report_stats: - print "Ok" - - if start_recno >= db_len: - _s = "start_recno (%d) >= db_len (%d), restarting" % (start_recno, db_len) - log(" " + _s) - if report_stats: - print " chk_urls: " + _s - del _s - start_recno = 0 - - if report_stats: - sys.stdout.write("Checking: ") - sys.stdout.flush() - - if show_pbar: - pbar = ttyProgressBar(0, db_len) - - urls_no = 0 - record_count = 0 - start_time = time.time() - - rcode = 1 - for record_no in range(start_recno, db_len): - if show_pbar: - pbar.display(record_no+1) - - record = bookmarks_db[record_no] - record_count = record_count + 1 - - if record.has_key("URL"): - url_type, url_rest = splittype(record["URL"]) - log("Checking %s:%s" % (url_type, url_rest)) - rcode = check_url(record, url_type, url_rest) - if rcode: - current_time = time.time() - if current_time - start_time >= 300: # Save checkpoint and database every 5 min - bookmarks_db.store_to_file("check.db") - set_checkpoint(record_no) - log.flush() - start_time = current_time - urls_no = urls_no + 1 - else: - log("Interrupted by user (^C)") - break - - if show_pbar: - del pbar - - if report_stats: - print "Ok" - print record_count, "records checked" - print urls_no, "URLs checked" - - bookmarks_db.store_to_file("check.db") - - if rcode: - log("chk_urls finished ok") - log.close() - - urlcleanup() - if os.path.exists(tempfname): - os.unlink(tempfname) - - if rcode: - if os.path.exists("check.dat"): - os.unlink("check.dat") - else: - set_checkpoint(record_no) - sys.exit(1) - - -if __name__ == '__main__': - run() diff --git a/convert_st.py b/convert_st.py new file mode 100755 index 0000000..bba98e9 --- 
/dev/null +++ b/convert_st.py @@ -0,0 +1,55 @@ +#! /usr/local/bin/python -O +""" + Convert a bkmk database to a different storage. + + Written by BroytMann, Apr 2000 - Aug 2002. Copyright (C) 2000-2002 PhiloSoft Design +""" + + +import sys + + +def run(): + from getopt import getopt + optlist, args = getopt(sys.argv[1:], "s") + + report_stats = 1 + + for _opt, _arg in optlist: + if _opt == '-s': + report_stats = 0 + try: + del _opt, _arg + except NameError: + pass + + if len(args) <> 1: + sys.stderr.write("convert_st: too many or too few arguments\n") + sys.stderr.write("Usage: convert_st [-s] new_storage\n") + sys.exit(1) + + from storage import storage, import_storage + storage = storage() + + new_storage = import_storage(args[0]) + new_storage = new_storage() + + if report_stats: + sys.stdout.write("Loading %s: " % storage.filename) + sys.stdout.flush() + + root_folder = storage.load() + + if report_stats: + print "Ok" + sys.stdout.write("Converting to %s: " % new_storage.filename) + sys.stdout.flush() + + new_storage.store(root_folder) + + if report_stats: + print "Ok" + + +if __name__ == '__main__': + run() diff --git a/copy_err.py b/copy_err.py deleted file mode 100755 index 4aa6e35..0000000 --- a/copy_err.py +++ /dev/null @@ -1,24 +0,0 @@ -#! /usr/local/bin/python -O -""" - Test FLAD database for old records - - Written by BroytMann, Feb 2000. Copyright (C) 2000 PhiloSoft Design -""" - - -import fladm - - -def run(): - bookmarks_db = fladm.load_from_file("bookmarks.db", fladm.check_record, ["Level"]) - errors = fladm.Flad_WithMustKeys(fladm.check_record, ["Level"]) - - for record in bookmarks_db: - if record.has_key("Error"): - errors.append(record) - - errors.store_to_file("errors.db") - - -if __name__ == '__main__': - run() diff --git a/db2bkmk.py b/db2bkmk.py index cc2bb82..ed5d518 100755 --- a/db2bkmk.py +++ b/db2bkmk.py @@ -1,164 +1,37 @@ #! 
/usr/local/bin/python -O """ - Convert FLAD database back to bookmarks.html suitable for Netscape Navigator + Convert a bkmk database back to bookmarks.html (or other format defined by writer) - Written by BroytMann, Jun 1997 - Mar 1999. Copyright (C) 1997-1999 PhiloSoft Design + Written by BroytMann, Mar 2000 - Aug 2002. Copyright (C) 2000-2002 PhiloSoft Design """ -import sys, os, string, shutil -from getopt import getopt -import fladm - - -def write(str): - if private_level == 0: # Put in public all except private folder - public_html.write(str) - private_html.write(str) - - -def unindent(old_level, new_level): - while old_level > new_level: - old_level = old_level - 1 - write(" "*old_level + "

\n") - - -def gen_html(bookmarks_db, show_pbar, report_stats): - global pbar, record_no, urls_no, public_html, private_html, private_level - - shutil.copy("header", "public.html") - shutil.copy("header", "private.html") - - public_html = open("public.html", 'a') - private_html = open("private.html", 'a') - - record_no = 0 - urls_no = 0 - - save_level = 0 - got_folder = 1 # Start as if we already have one folder - private_level = 0 - - for record in bookmarks_db: - record_no = record_no + 1 - - if show_pbar: - pbar.display(record_no) - - level = string.atoi(record["Level"]) - - if level == save_level: - pass - elif level == save_level + 1: - if got_folder: - write(" "*(level - 1) + "

\n") - else: - raise ValueError, "indent without folder" - elif level <= save_level - 1: - unindent(save_level, level) - else: - raise ValueError, "new level (%d) too big; must be %d - %d" % (level, save_level-1, save_level+1) - - save_level = level - got_folder = record.has_key("Folder") # Test here to save got_folder for next loop - - if private_level == save_level: - private_level = 0 # We've returned to saved private level - private folder is over - - if record.has_key("URL"): - write(" "*level + '

%s\n' % (record["URL"], record["AddDate"], record["LastVisit"], record["LastModified"], record["Title"])) - urls_no = urls_no + 1 - - elif record.has_key("Folder"): - # Dirty hacks here - if (record["Folder"] == "Private links") and (private_level == 0): - private_level = save_level # We found private folder - save its level - - if record["Folder"] == "All the rest - Unclassified": - write(" "*level + '

%s

\n' % (record["AddDate"], record["Folder"])) - else: - write(" "*level + '

%s

\n' % (record["AddDate"], record["Folder"])) - - elif record.has_key("Ruler"): - write(" "*level + "
\n") - - else: - raise KeyError, "neither \"URL\" nor \"Folder\" nor \"Ruler\" in record " + str(record) - - if record.has_key("Comment") and (record["Comment"] <> ''): - write("
%s\n" % string.join(string.split(record["Comment"], "
"), "
\n")) - - - if save_level >= 0: - unindent(save_level, 0) - else: - raise ValueError, "new level (%d) too little - must be >= 0" % save_level - - public_html.close() - private_html.close() - - if show_pbar: - del pbar - - if report_stats: - print "Ok" - - -def translate(bookmarks_db, transldb_name, transl, show_pbar, report_stats): - global pbar, record_no, urls_no, public_html, private_html, private_level - - new_ext = str(transl) - os.rename("public.html", "public." + new_ext) - os.rename("private.html", "private." + new_ext) - - transl_d = {} - transl_db = fladm.load_from_file(transldb_name, fladm.check_record, ["URL1", "URL2"], [""]) - # This prevents any other key to appear in transl.db ^ - - # Generate translation dictionary (hash table) - if transl == 1: - for record in transl_db: - transl_d[record["URL1"]] = record["URL2"] - elif transl == 2: - for record in transl_db: - transl_d[record["URL2"]] = record["URL1"] - else: - raise ValueError, "transl (%d) must be 1 or 2" % transl - - del transl_db # Save few bytes of memory - transl_k = transl_d.keys() - - # Translate URLs - for record in bookmarks_db: - if record.has_key("URL") and (record["URL"] in transl_k): - record["URL"] = transl_d[record["URL"]] - - gen_html(bookmarks_db, show_pbar, report_stats) - - new_ext = str(3 - transl) # Translate 1 to 2, or 2 to 1 - os.rename("public.html", "public." + new_ext) - os.rename("private.html", "private." 
+ new_ext) +import sys def run(): - global pbar, record_no, urls_no, public_html, private_html, private_level - - optlist, args = getopt(sys.argv[1:], "ist:r") + from getopt import getopt + optlist, args = getopt(sys.argv[1:], "sp:o:t:r") - show_pbar = 1 report_stats = 1 + prune = None + + from writers import writer + output_filename = writer.filename - transldb_name = "" # dictionary translation; default is no translation transl = 0 + transl_name = "" # dictionary translation; default is no translation for _opt, _arg in optlist: - if _opt == '-i': - show_pbar = 0 if _opt == '-s': report_stats = 0 + if _opt == '-p': + prune = _arg + if _opt == '-o': + output_filename = _arg if _opt == '-t': - transldb_name = _arg transl = 1 + transl_name = _arg if _opt == '-r': transl = 2 try: @@ -168,52 +41,65 @@ def run(): if args: sys.stderr.write("db2bkmk: too many arguments\n") + sys.stderr.write("Usage: db2bkmk [-s] [-p prune_folder] [-o filename] [-t trans] [-r]\n") sys.exit(1) - if show_pbar: - show_pbar = sys.stderr.isatty() - - if show_pbar: - try: - from tty_pbar import ttyProgressBar - except ImportError: - show_pbar = 0 + from storage import storage + storage = storage() if report_stats: - sys.stdout.write("Loading: ") + sys.stdout.write("Loading %s: " % storage.filename) sys.stdout.flush() - bookmarks_db = fladm.load_from_file("bookmarks.db", fladm.check_record, ["Level"]) + root_folder = storage.load() if report_stats: print "Ok" - sys.stdout.write("Converting FLAD database to bookmarks.html: ") + sys.stdout.write("Writing %s: " % output_filename) sys.stdout.flush() - if show_pbar: - pbar = ttyProgressBar(0, len(bookmarks_db)) - - gen_html(bookmarks_db, show_pbar, report_stats) if transl: - if report_stats: - sys.stdout.write("Translating: ") - sys.stdout.flush() + new_ext = str(transl) + transl_d = {} + + from m_lib.flad import fladm + transl_db = fladm.load_from_file(transl_name, fladm.check_record, ["URL1", "URL2"], [""]) + # This prevents any other key to appear 
in transl_db ^ + + # Generate translation dictionary (hash table) + if transl == 1: + for record in transl_db: + transl_d[record["URL1"]] = record["URL2"] + elif transl == 2: + for record in transl_db: + transl_d[record["URL2"]] = record["URL1"] + else: + raise ValueError, "transl (%d) must be 1 or 2" % transl - if report_stats and show_pbar: # Display bar only without "-i"; - # with "-s" skip it (one bar already - # displayed, and it is enough) - pbar = ttyProgressBar(0, len(bookmarks_db)) + del transl_db # Save few bytes of memory - else: - show_pbar = 0 + from bkmk_objects import Walker + class Transl(Walker): + def __init__(self, transl_d): + self.transl_d = transl_d + + def bookmark(self, b, level): + href = b.href + transl_d = self.transl_d - translate(bookmarks_db, transldb_name, transl, show_pbar, report_stats) + if transl_d.has_key(href): + b.href = transl_d[href] + root_folder.walk_depth(Transl(transl_d)) + + + outfile = open(output_filename, 'w') + root_folder.walk_depth(writer(outfile, prune)) + outfile.close() if report_stats: - print record_no, "records proceed" - print urls_no, "urls created" + print "Ok" if __name__ == '__main__': diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE new file mode 100644 index 0000000..b4974dc --- /dev/null +++ b/doc/ANNOUNCE @@ -0,0 +1,76 @@ + + Bookmarks Database and Internet Robot + +WHAT IS IT + There is a set of classes, libraries, programs and plugins I use to +manipulate my bookmarks.html. I like Netscape Navigator, but I need more +features, so I write and maintain these programs for my needs. I need to +extend Navigator's "What's new" feature (Navigator 4 calls it "Update +bookmarks"). + + +WHAT'S NEW in version 3.3.1 + New shell scripts in the example area. + + +WHAT'S NEW in version 3.3.0 + Required Python 2.2. + HTML parser. 
If the protocol is HTTP, and there is Content-Type header, and +content type is text/html, the object is parsed to extract its title; if the +Content-Type header has charset, or if the HTML has <META> with charset, the +title is converted from the given charset to the default charset. The object is +also parsed to extract <META> tag with redirect. + + +WHAT'S NEW in version 3.0 + Complete rewrite from scratch. Created mechanism for pluggable storage +managers, writers (DB dumpers/exporters) and robots. + + +WHERE TO GET + Master site: http://phd.pp.ru/Software/Python/#bookmarks_db + + Faster mirrors: http://phd.by.ru/Software/Python/#bookmarks_db + http://phd2.chat.ru/Software/Python/#bookmarks_db + + +AUTHOR + Oleg Broytmann + +COPYRIGHT + Copyright (C) 1997-2002 PhiloSoft Design + +LICENSE + GPL + +STATUS + Storage managers: pickle, FLAD (Flat ASCII Database). + Writers: HTML, text, FLAD (full database or only errors). + Robots (URL checker): simple, simple+timeoutsocket, forking. + +TODO + Parse downloaded file and get some additional information out of headers + and parsed data - title, for example. Or redirects using <META>. + (Partially done - now extracting title). + + Documentation. + + Merge "writers" to storage managers. + New storage managers: shelve, SQL, ZODB, MetaKit. + Robots (URL checkers): threading, asyncore-based. + Aliases in bookmarks.html. + + Configuration file for configuring defaults - global defaults for the system + and local defaults for subsystems. + + Ruleset-based mechanisms to filter out what types of URLs to check: checking + based on URL schema, host, port, path, filename, extension, etc. + + Detailed reports on robot run - what's old, what's new, what was moved, + errors, etc. + WWW-interface to the report. + + Bigger database. Multiuser database. Robot should operate on a part of + the DB. + WWW-interface to the database. User will import/export/edit bookmarks, + schedule robot run, etc.
diff --git a/doc/ChangeLog b/doc/ChangeLog new file mode 100644 index 0000000..e69de29 diff --git a/doc/NEWS b/doc/NEWS new file mode 100644 index 0000000..e69de29 diff --git a/doc/README b/doc/README new file mode 100644 index 0000000..c616975 --- /dev/null +++ b/doc/README @@ -0,0 +1,226 @@ + + Bookmarks Database and Internet Robot + + Here is a set of classes, libraries, programs and plugins I use to +manipulate my bookmarks.html. I like Netscape Navigator, but I need more +features, so I write and maintain these programs for my needs. I need to +extend Navigator's "What's new" feature (Navigator 4 named it "Update +bookmarks"). + + These programs are intended to run as follows. +1. bkmk2db converts bookmarks.html to bookmarks.db. +2. check_urls (Internet robot) runs against bookmarks.db, checks every URL and + saves results in check.db. +3. db2bkmk converts bookmarks.db back to bookmarks.html. + Then I use this bookmarks file and... +4. bkmk2db converts bookmarks.html to bookmarks.db. +5. check_urls (Internet robot) runs against bookmarks.db, checks every URL and + saves results in check.db (old file copied to check.old). +6. (An yet unnamed program) will compare check.old with check.db and generate +detailed report. For example: + this URL is unchanged + this URL is changed + this URL is unavailable due to: host not found... + +AUTHOR + Oleg Broytmann + +COPYRIGHT and LEGAL ISSUES + Copyright (C) 1997-2002 PhiloSoft Design +All sources protected by GNU GPL. Programs are provided "as-is", without +any kind of warranty. All usual blah-blah-blah. + + #include + +LICENSE + GPL + +------------------------------ environ ------------------------------ + + These programs use the following environment variables: + +BKMK_STORAGE - use this storage plugin; default is pickle storage. +BKMK_WRITER - use this writer plugin; default is HTML writer. +BKMK_ROBOT - use this robot plugin; default is forking robot. 
+ + +------------------------------ bkmk2db ------------------------------ + NAME + bkmk2db.py - script to convert bookmarks.html to a database. + + SYNOPSIS + bkmk2db.py [-is] [/path/to/bookmarks.html] + + DESCRIPTION + bkmk2db.py splits given file (or ./bookmarks.html) into a database + (using storage plugin). + + Options: + -i + Inhibit progress bar. Default is to display progress bar if + stderr.isatty() + + -s + Suppress output of statistics at the end of the program. Default + is to write how many lines the program read and how many URLs + parsed. Also suppress some messages during run. + + BUGS + Aliases are not supported (yet). + + +------------------------------ db2bkmk ------------------------------ + NAME + db2bkmk.py - script to reconstruct bookmarks.html back from a + database. + + SYNOPSIS + db2bkmk.py [-s] [-p prune] [-o output_file] [-t dict.db [-r]] + + DESCRIPTION + db2bkmk.py reads bookmarks.db and writes it back out as bookmarks.html (or another format defined by the writer plugin). + + Options: + -s + Suppress output of statistics at the end of the program. Default is + to write how many records the program processed and how many URLs + created. Also suppress some messages during run. + + -p prune + Prune bookmarks tree if encounter a folder with this name. + + -o output_file + Put output into different file. + + -t dict.db + For most tasks, if someone needs to process bookmarks.db in a + regular way (for example, replace all "gopher://gopher." with + "http://www."), it is easy to write special program, processing + every DB record. But there are cases when someone needs to process + bookmarks.db in a non-regular way: one URL must be changed + in one way, another URL - in second way, etc. The -t option allows to + use external dictionary for such translation. The dictionary itself + is FLAD database, where every record has two keys - URL1 and + URL2.
With -t option in effect, db2bkmk generates translated + version of bookmarks.html, where every URL1 is replaced with + corresponding URL2 from the translation dictionary. (See koi2win.db + for example of translation dictionary) + + -r + Reverse the effect of -t option - translate from URL2 to URL1. + + +------------------------------ check_urls ----------------------------- + NAME + check_urls.py - Internet robot + + SYNOPSIS + check_urls.py [-ise] + + DESCRIPTION + check_urls.py runs a robot plugin against every URL. An additional field + Error appears in records that could not be checked for some reason; + the reason is the content of the Error field. + + Options: + -i + Inhibit progress bar. Default is to display progress bar if + stderr.isatty() + + -s + Suppress output of statistics at the end of the program. Default is + to write how many records the program processed and how many URLs + checked. Also suppress some messages during run. + + -e + Check only those URLs that have an "error" mark in the DB. + + BUGS + Ugly mechanism to catch welcome message from FTP server (from urllib). + + +------------------------------ convert_st ----------------------------- + NAME + convert_st.py - convert between storages. + + SYNOPSIS + convert_st.py [-s] new_format. + + DESCRIPTION + convert_st.py converts the database from one format to another. + + Options: + -s + Suppress output of statistics at the end of the program. Default is + to write how many records the program processed and how many URLs + checked. Also suppress some messages during run. + + +------------------------------ sort_db ----------------------------- + NAME + sort_db.py - sort DB. + + SYNOPSIS + sort_db.py [-savmr] + + DESCRIPTION + sort_db.py sorts the database according to one of the time + fields and dumps the sorted list of bookmarks. + + Options: + -s + Suppress output of statistics at the end of the program. Default is + to write how many records the program processed and how many URLs + checked.
Also suppress some messages during run. + + -a + Sort by add_date. + + -v + Sort by last_visit. + + -m + Sort by last_modified. + + -r + Reverse sort. + + +------------------------------ check_dups ----------------------------- + NAME + check_dups.py - check duplicated URLs in the DB. + + SYNOPSIS + check_dups.py [-s] [-l logfile] + + DESCRIPTION + check_dups.py prints out a list of duplicated URLs (if any). + + Options: + -s + Suppress output of statistics at the end of the program. Default is + to write how many records the program processed and how many URLs + checked. Also suppress some messages during run. + + -l logfile + Save the list of dups in the logfile. + + +------------------------------ bkmk-add ----------------------------- + NAME + bkmk-add - add a bookmark to the DB. + + SYNOPSIS + bkmk-add [-s] [-t title] url + + DESCRIPTION + bkmk-add adds a bookmark to the DB. + + Options: + -s + Suppress output of statistics at the end of the program. Default is + to write how many records the program processed and how many URLs + checked. Also suppress some messages during run. + + -t title + Force title of the bookmark. diff --git a/doc/TODO b/doc/TODO new file mode 100644 index 0000000..887e5a9 --- /dev/null +++ b/doc/TODO @@ -0,0 +1,25 @@ + Parse downloaded file and get some additional information out of headers + and parsed data - title, for example. Or redirects using <META>. + (Partially done - now extracting title). + + Documentation. + + Merge "writers" to storage managers. + New storage managers: shelve, SQL, ZODB, MetaKit. + Robots (URL checkers): threading, asyncore-based. + Aliases in bookmarks.html. + + Configuration file for configuring defaults - global defaults for the system + and local defaults for subsystems. + + Ruleset-based mechanisms to filter out what types of URLs to check: checking + based on URL schema, host, port, path, filename, extension, etc. + + Detailed reports on robot run - what's old, what's new, what was moved, + errors, etc.
+ WWW-interface to the report. + + Bigger database. Multiuser database. Robot should operate on a part of + the DB. + WWW-interface to the database. User will import/export/edit bookmarks, + schedule robot run, etc. diff --git a/hotexplode.pl b/hotexplode.pl new file mode 100755 index 0000000..4f3e845 --- /dev/null +++ b/hotexplode.pl @@ -0,0 +1,180 @@ +#!/usr/bin/perl + +# hotexplode -- a program for "exploding" a xmosaic hotlist or Netscape +# bookmark file into a hierarchial multi-page structure. +# acb 60 Chs 3162 + +# revision history: +# v1.0: 1-3-1996: initial version + +$date = `date`; + +# customise below + +# header: some arbitrary HTML text which is appended below the title and +# above the hotlist data + +$header = < +
+This hotlist was generated with +hotexplode +on $date. +

+WARNING: The inclusion of a link to a page on +this hotlist is not an indication of the maintainer's +approval of or agreement with its content. +

+
+
+Please DO NOT bookmark this page. Bookmark +main page instead. +Any other page in the hierarchy may disappear at any time. +
+FOO + +$footer = < +FOO + +# which directory shall contain the hotlist? + +$outdir = "hotlist"; + + +# end of customisable portion + +require "getopts.pl"; + +&Getopts("o:t:v"); + +$outdir = $opt_o if $opt_o; + + +# seek forward to the title +while (<>) { + if (/([^\<\>]*)<\/TITLE>/) { + $title = $1; + last; + } +} + +$title = $opt_t if $opt_t; + +# seek forward to the start of the list + + +while (<>) { + if(/<UL>/) { warn "Detected xmosaic hotlist format\n" if $opt_v; + &parse_mosaic_hotlist($outdir, $title); last; } + if(/<DL>/) { warn "Detected Netscape bookmark format\n" if $opt_v; + &parse_netscape_bookmarks($outdir, $title); last; } +} + +# parse an xmosaic hotlist +# exit when we meet a </UL> +# arguments: pathname of directory in which output is to be placed, +# title + +sub parse_mosaic_hotlist { + # we write the file at the very end, because (I think) filehandles do + # not have local scope, and this is recursive + local($prefix, $title) = @_; + local($result) = "<HTML><HEAD><TITLE>$title \ +\n

$title

\n $header \n
\n
    "; + + warn "Creating $prefix...\n" if $opt_v; + + # create the directory, if needed + mkdir($prefix, 0755) unless -d $prefix; + + while (<>) { + last if (/<\/UL>/); + + if(/
  • *]*>([^\<]*)<\/A>/) { + # + # A URL + # + local($url,$name) = ($1, $2); + $result = $result."
  • $name \n"; + next; + } + if(/
  • (.*)$/) { + # + # we've got a live one here... + # + local($subtitle)=local($filename)=$1; + $filename =~ tr/0-9A-Za-z//cd; + $filename =~ tr/A-Z/a-z/; + <>; # eat the "
      " line. + $result .= "
    • ${subtitle}\n"; + &parse_mosaic_hotlist("${prefix}/${filename}", "${title}:${subtitle}"); + next; + } + + } + + $result = $result . $footer . ""; + # write it to a file + open(FILE, ">${prefix}/index.html"); + print FILE $result; + close(FILE); +} + +# parse a Netscape bookmarks list +# exit when we meet a
+# arguments: pathname of directory in which output is to be placed, +# subtitle + +sub parse_netscape_bookmarks { + # we write the file at the very end, because (I think) filehandles do + # not have local scope, and this is recursive + local($prefix, $title) = @_; + local($result) = "$title \ +\n

$title

\n $header \n
\n
"; + + warn "Creating $prefix...\n" if $opt_v; + + # create the directory, if needed + mkdir($prefix, 0755) unless -d $prefix; + + while (<>) { + last if (/<\/DL>/); + if (/
]*>([^\<]*)<\/H3>/) { + # + # a nested list + # + local($subtitle)=$1; + local($filename)=$1; + $filename =~ tr/0-9A-Za-z//cd; + $filename =~ tr/A-Z/a-z/; + # parse the description here + local($desc)=""; + while(<>) { + last if (/
/); + $desc = $desc . $_; + } + $result = $result . "
${subtitle}\n"; + unless("$desc" eq "") { $result = $result . $desc; } + &parse_netscape_bookmarks("${prefix}/${filename}", + "${title}:${subtitle}"); + next; + } + if (/
]*>([^\<]*)<\/A>/) { + # + # A URL + # + local($url, $name) = ($1, $2); + $result = $result."
$name \n"; + next; + } + $result = $result . $_; + } + $result = $result . $footer . ""; + # write it to a file + open(FILE, ">${prefix}/index.html"); + print FILE $result; + close(FILE); +} + + diff --git a/koi2win.db b/koi2win.db index 75433b8..e69de29 100644 --- a/koi2win.db +++ b/koi2win.db @@ -1,14 +0,0 @@ -URL1: http://www.xland.ru:8088/tel_koi/owa/tel.intro -URL2: http://www.xland.ru:8088/tel_win/owa/tel.intro - -URL1: http://meteo.infospace.ru/koi/moscow/html/r_index.htm -URL2: http://meteo.infospace.ru/win/moscow/html/r_index.htm - -URL1: http://meteo.infospace.ru/koi/wcond/html/r_index.ssi -URL2: http://meteo.infospace.ru/win/wcond/html/r_index.ssi - -URL1: http://koi.dzik.aha.ru/ -URL2: http://www.dzik.aha.ru/ - -URL1: http://www-psb.ad-sbras.nsc.ru/kruglk.htm -URL2: http://www-psb.ad-sbras.nsc.ru/kruglw.htm diff --git a/mz-unescape b/mz-unescape new file mode 100755 index 0000000..8bf8865 --- /dev/null +++ b/mz-unescape @@ -0,0 +1,6 @@ +#! /bin/sh + +dest_dir=$HOME/.mozilla/phd/`ls -1 $HOME/.mozilla/phd` +sed -e 's/<//g' -e 's/&/\&/g' \ + $dest_dir/bookmarks.html >_tmp.$$ && \ +exec mv _tmp.$$ $dest_dir/bookmarks.html diff --git a/ns-unescape b/ns-unescape new file mode 100755 index 0000000..90f4f8f --- /dev/null +++ b/ns-unescape @@ -0,0 +1,5 @@ +#! /bin/sh + +sed -e 's/<//g' -e 's/&/\&/g' \ + $HOME/.netscape/bookmarks.html >_tmp.$$ && \ +exec mv _tmp.$$ $HOME/.netscape/bookmarks.html diff --git a/readme b/readme deleted file mode 100644 index 13d197f..0000000 --- a/readme +++ /dev/null @@ -1,207 +0,0 @@ - - BOOKMARKS database and internet robot - - Here is a set of classes, libraries and programs I use to manipulate my -bookmarks.html. I like Netscape Navigator, but I need more features, so I am -writing these programs for my needs. I need to extend Navigator's "What's new" -feature (Navigator 4 named it "Update bookmarks"). - - These programs are intended to run as follows. -1. bkmk2db converts bookmarks.html to bookmarks.db. -2. 
chk_urls (Internet robot) runs against bookmarks.db, checks every URL and - saves results in check.db. -3. db2bkmk converts bookmarks.db back to bookmarks.html. - Then I use this bookmarks file and... -4. bkmk2db converts bookmarks.html to bookmarks.db. -5. chk_urls (Internet robot) runs against bookmarks.db, checks every URL and - saves results in check.db (old file copied to check.old). -6. (An yet unnamed program) will compare check.old with check.db and generate -detailed report. For example: - this URL is unchanged - this URL is changed - this URL is unavailable due to: host not found... - - Bookmarks database programs are almost debugged. What need to be done is -support for aliases. Second version of the internet robot is finished. - - Although not required, these programs work fine with tty_pbar.py (my little -module for creating text-mode progress bars). - -COPYRIGHT and LEGAL ISSUES - All programs copyrighted by Oleg Broytmann and PhiloSoft Design. All -sources protected by GNU GPL. Programs are provided "as-is", without any kind -of warranty. All usual blah-blah-blah. - - #include - - ------------------------------- bkmk2db ------------------------------ - NAME - bkmk2db.py - script to convert bookmarks.html to FLAD database. - - SYNOPSIS - bkmk2db.py [-its] [/path/to/bookmarks.html] - - DESCRIPTION - bkmk2db.py splits given file (or ./bookmarks.html) into FLAD database - bookmarks.db in current directory. - - Options: - -i - Inhibit progress bar. Default is to display progress bar if - stderr.isatty() - - -t - Convert to text file (for debugging). Default is to convert to - FLAD. - - -s - Suppress output of statistics at the end of the program. Default - is to write how many lines the program read and how many URLs - parsed. Also suppress some messages during run. - - BUGS - The program starts working by writing lines to header file until - BookmarksParser initializes its own output file (this occur when - parser encountered 1st
tag). It is misdesign. - - Empty comments (no text after
) are not marked specially in - database, so db2bkmk.py will not reconstruct it. I don't need empty -
s, so I consider it as feature, not a real bug. - - Aliases are not supported (yet). - - ------------------------------- db2bkmk ------------------------------ - NAME - db2bkmk.py - script to reconstruct bookmarks.html back from FLAD - database. - - SYNOPSIS - db2bkmk.py [-is] [-t dict.db [-r]] - - DESCRIPTION - db2bkmk.py reads bookmarks.db and creates two HTML files - - public.html and private.html. The latter is just full - bookmarks.html, while the former file hides private folder. - - Options: - -i - Inhibit progress bar. Default is to display progress bar if - stderr.isatty() - - -s - Suppress output of statistics at the end of the program. Default is - to write how many records the program proceed and how many URLs - created. Also suppress some messages during run. - - -t dict.db - For most tasks, if someone need to process bookmarks.db in a - regular way (for example, replace all "gopher://gopher." with - "http://www."), it is easy to write special program, processing - every DB record. For some tasks it is even simpler and faster to - write sed/awk scripts. But there are cases when someone need to - process bookmarks.db in a non-regular way: one URL must be changed - in one way, another URL - in second way, etc. The -t option allows - to use external dictionary for such translation. The dictionary - itself is again FLAD database, where every record have two keys - - URL1 and URL2. With -t option in effect, db2bkmk generates - {private,public}.html, renames them to {private,public}.1, and - then translates the entire bookmarks.db again, generating - {private,public}.2 (totally 4 files), where every URL1 replaced - with URL2 from dictionary. (See koi2win.db for example of - translation dictionary) - - -r - Reverse the effect of -t option - translate from URL2 to URL1. - - BUGS - There are three hacks under line marked with "Dirty hacks here": - 1. if record["Folder"] == "Private links": - This is to hide passwords from my bookmarks file. - - 2. 
if record["Folder"] == "All the rest - Unclassified": - outfile.write(" "*level + "

#! /usr/local/bin/python -O
"""
   Sort bookmarks DB according to a rule:
   -a - by AddDate
   -v - by LastVisit
   -m - by LastModified
   -z - by Size
   -t - by LastTested
   default is -m
   -r - reverse the sort order
   -s - do not print progress messages

   Written by BroytMann, Apr 2000. Copyright (C) 2000 PhiloSoft Design
"""


import sys


class SortBy:
    """Order objects by one integer-valued attribute.

    The attribute value is converted with int().  Objects whose attribute
    is missing, None, or not convertible to an integer are placed after
    every object that has a usable value; among themselves they compare
    equal, so the ordering is consistent in both directions.

    An instance can be used either as a classic two-argument comparison
    function (``by(o1, o2)`` returns -1, 0 or 1) or through ``by.key`` as
    a ``key=`` function for list.sort()/sorted().
    """

    def __init__(self, sort_by):
        # Name of the attribute to sort on, e.g. "last_modified".
        self.sort_by = sort_by

    def key(self, obj):
        """Return a sort key for `obj`: (0, value) or (1, 0) if unusable."""
        try:
            return (0, int(getattr(obj, self.sort_by)))
        except (TypeError, ValueError, AttributeError):
            # No attribute, None, or junk text: sort after all real values.
            return (1, 0)

    def __call__(self, o1, o2):
        """Compare two objects; return -1, 0 or 1 like the old cmp()."""
        key1 = self.key(o1)
        key2 = self.key(o2)
        # Spelled without the cmp() builtin so it runs on Python 2 and 3.
        return (key1 > key2) - (key1 < key2)


def walk_linear(linear, walker):
    """Pass every bookmark in `linear` to `walker.bookmark` at level 0.

    Items whose `isBookmark` attribute is false are skipped.
    """
    for item in linear:
        if item.isBookmark:
            walker.bookmark(item, 0)


# Command-line switch -> attribute name handed to SortBy.
_SORT_OPTIONS = {
    '-a': "add_date",
    '-v': "last_visit",
    '-m': "last_modified",
    '-z': "size",
    '-t': "last_tested",
}


def run():
    """Load the bookmarks, sort them and write the sorted list to a file.

    Options are read from sys.argv (see the module docstring).  The output
    file is named "<writer.filename>-sorted_by-<attribute>", with
    "-reverse" appended when -r is given.
    """
    from getopt import getopt
    optlist, _ = getopt(sys.argv[1:], "avmztrs")

    sort_by = "last_modified"
    reverse = 0
    report_stats = 1

    for opt, _value in optlist:
        if opt in _SORT_OPTIONS:
            sort_by = _SORT_OPTIONS[opt]
        elif opt == '-r':
            reverse = 1
        elif opt == '-s':
            report_stats = 0

    from storage import storage
    store = storage()

    if report_stats:
        sys.stdout.write("Loading %s: " % store.filename)
        sys.stdout.flush()

    root_folder = store.load()

    if report_stats:
        sys.stdout.write("Ok\n")
        sys.stdout.write("Sorting (by %s): " % sort_by)
        sys.stdout.flush()

    from bkmk_objects import make_linear
    make_linear(root_folder)

    linear = root_folder.linear
    del linear[0]  # exclude root folder from sorting

    # key= sorting replaces the comparison-function form, which Python 3
    # no longer accepts.
    linear.sort(key=SortBy(sort_by).key)

    from writers import writer
    output_filename = "%s-sorted_by-%s" % (writer.filename, sort_by)

    if reverse:
        linear.reverse()
        output_filename = output_filename + "-reverse"

    if report_stats:
        sys.stdout.write("done\n")
        sys.stdout.write("Writing %s: " % output_filename)
        sys.stdout.flush()

    outfile = open(output_filename, 'w')
    try:
        out = writer(outfile)
        out.root_folder(root_folder)
        walk_linear(linear, out)
    finally:
        # Close the file even if the writer fails part-way through.
        outfile.close()

    if report_stats:
        sys.stdout.write("Ok\n")


if __name__ == '__main__':
    run()