From: Oleg Broytman Date: Sat, 30 Nov 2013 22:12:31 +0000 (+0400) Subject: Import m_lib/hash X-Git-Tag: 2.1.3~34 X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=83f94736a59fe357b8e50ebfbc548bdafe2ccda5;p=m_lib.git Import m_lib/hash --- diff --git a/m_lib/hash/MKhash.py b/m_lib/hash/MKhash.py new file mode 100644 index 0000000..707d9fe --- /dev/null +++ b/m_lib/hash/MKhash.py @@ -0,0 +1,98 @@ +"""Provide a (g)dbm-compatible interface to MetaKit. +Author: Oleg Broytman +Copyright (C) 2001-2002 PhiloSoft Design +License: Python""" + + +import sys +try: + import Mk4py +except ImportError: + # prevent a second import of this module from spuriously succeeding + del sys.modules[__name__] + raise + + +__all__ = ["error", "open"] + +error = ValueError + + +class MKhash: + def __init__(self, file, flag, mode=0666, trans_threshold=1000): + self.read_only = 0 + self._closed = 0 + + self.trans_threshold = trans_threshold + self._transcount = 0 # transactions counter - for commiting transactions + + if flag in ('c', 'n'): + mode = 1 + elif flag == 'r': + mode = 0 + self.read_only = 1 + else: + mode = 2 + + self.db = db = Mk4py.storage(file, mode) + if mode == 1: + self.vw = db.getas("hash[key:S,value:S]") + else: + self.vw = db.view("hash") + + def __del__(self): + self.close() + + def keys(self): + return map(lambda x: x.key, self.vw) + + def __len__(self): + return len(self.vw) + + def has_key(self, key): + return self.vw.find(key=key)+1 + + def get(self, key, default=None): + if self.has_key(key): + return self[key] + return default + + def __getitem__(self, key): + vw = self.vw + ix = vw.find(key=key) + if ix == -1: + raise KeyError, key + return vw[ix].value + + def __setitem__(self, key, value): + vw = self.vw + ix = vw.find(key=key) + if ix == -1: + vw.append(key=key, value=value) + else: + vw[ix].value = value + self._add_tran() + + def __delitem__(self, key): + vw = self.vw + ix = vw.find(key=key) + if ix == -1: + raise KeyError, key + vw.delete(ix) + 
self._add_tran() + + def close(self): + if self._closed: return + if not self.read_only: self.db.commit() + del self.db + self._closed = 1 + + def _add_tran(self): + self._transcount = self._transcount + 1 + if self._transcount == self.trans_threshold: + self._transcount = 0 + self.db.commit() + + +def open(file, flag, mode=0666): + return MKhash(file, flag, mode) diff --git a/m_lib/hash/ZODBhash.py b/m_lib/hash/ZODBhash.py new file mode 100644 index 0000000..13073c5 --- /dev/null +++ b/m_lib/hash/ZODBhash.py @@ -0,0 +1,81 @@ +"""Provide a (g)dbm-compatible interface to ZODB. +Author: Oleg Broytman +Copyright (C) 2001-2002 PhiloSoft Design +License: Python""" + + +import sys +try: + from ZODB import FileStorage, DB, POSException +except ImportError: + # prevent a second import of this module from spuriously succeeding + del sys.modules[__name__] + raise + + +__all__ = ["error", "open"] + +error = POSException.POSError # Exported for anydbm + + +class ZODBhash: + def __init__(self, file, flag, mode=0666, trans_threshold=1000): + create = (flag == 'n') # force recreation + # if flag == 'w' or 'c' and file does not exist FileStorage will set it to 1 for us + + self.read_only = read_only = (flag == 'r') + self._closed = 0 + + self.trans_threshold = trans_threshold + self._transcount = 0 # transactions counter - for commiting transactions + + storage = FileStorage.FileStorage(file, create=create, read_only = read_only) + db = DB(storage) + self.conn = conn = db.open() + self.dbroot = conn.root() + + def __del__(self): + self.close() + + def keys(self): + return self.dbroot.keys() + + def __len__(self): + return len(self.dbroot) + + def has_key(self, key): + return self.dbroot.has_key(key) + + def get(self, key, default=None): + if self.dbroot.has_key(key): + return self[key] + return default + + def __getitem__(self, key): + return self.dbroot[key] + + def __setitem__(self, key, value): + self.dbroot[key] = value + self._add_tran() + + def __delitem__(self, key): + del 
self.dbroot[key] + self._add_tran() + + def close(self): + if self._closed: return + if not self.read_only: + get_transaction().commit() + self.conn.db().close() + self.conn.close() + self._closed = 1 + + def _add_tran(self): + self._transcount = self._transcount + 1 + if self._transcount == self.trans_threshold: + self._transcount = 0 + get_transaction().commit() + + +def open(file, flag, mode=0666): + return ZODBhash(file, flag, mode) diff --git a/m_lib/hash/__init__.py b/m_lib/hash/__init__.py new file mode 100644 index 0000000..626db2b --- /dev/null +++ b/m_lib/hash/__init__.py @@ -0,0 +1,50 @@ +"""Extended disk hashes package. It extends anydbm/whichdb with ZODB and +MetaKit-based hashes. +Author: Oleg Broytman +Copyright (C) 2001-2003 PhiloSoft Design +License: Python""" + + +__all__ = ["zshelve", "ZODBhash", "MKhash"] + + +import anydbm +anydbm._names.insert(len(anydbm._names)-1, ['ZODBhash', 'MKhash']) + # Insert before dumbdbm + + +import whichdb +_orig_module = whichdb +_orig_whichdb = _orig_module.whichdb + +def whichdb(filename): + result = _orig_whichdb(filename) + if result: + return result + + try: + f = open(filename, "rb") + except IOError: + return None + + # Read the start of the file -- the magic number + s = f.read(4) + f.close() + + # Return "" if not at least 4 bytes + if len(s) != 4: + return "" + + # Check for MetaKit + if s == "JL\x1A\0": + return "MKhash" + + # Check for ZODB + if s == "FS21": + return "ZODBhash" + + # Unknown + return "" + +_orig_module.whichdb = whichdb # Now install our extended replacement +whichdb.__doc__ = _orig_whichdb.__doc__ diff --git a/m_lib/hash/test/test-mk1.py b/m_lib/hash/test/test-mk1.py new file mode 100755 index 0000000..bd11165 --- /dev/null +++ b/m_lib/hash/test/test-mk1.py @@ -0,0 +1,10 @@ +#! /usr/bin/env python + + +from m_lib.hash import MKhash + + +print "Making..." +db = MKhash.open("db", 'c') +db["test"] = "Test Ok!" 
+db.close() diff --git a/m_lib/hash/test/test-mk2.py b/m_lib/hash/test/test-mk2.py new file mode 100755 index 0000000..d0ffe76 --- /dev/null +++ b/m_lib/hash/test/test-mk2.py @@ -0,0 +1,17 @@ +#! /usr/bin/env python + + +from m_lib.hash import MKhash + + +print "Testing..." +db = MKhash.open("db", 'w') +print db["test"] +print len(db) +print db.keys() +print db.has_key("test") +print db.has_key("Test") +print db.get("test", "Yes") +print db.get("Test", "No") +del db["test"] +db.close() diff --git a/m_lib/hash/test/test-mk3.py b/m_lib/hash/test/test-mk3.py new file mode 100755 index 0000000..10bcb24 --- /dev/null +++ b/m_lib/hash/test/test-mk3.py @@ -0,0 +1,12 @@ +#! /usr/bin/env python + + +from m_lib.hash import MKhash + + +print "Testing (more)..." +db = MKhash.open("db", 'r') +print len(db) +print db.keys() +print db.has_key("test") +db.close() diff --git a/m_lib/hash/test/test-shelve1.py b/m_lib/hash/test/test-shelve1.py new file mode 100755 index 0000000..591bcd1 --- /dev/null +++ b/m_lib/hash/test/test-shelve1.py @@ -0,0 +1,8 @@ +#! /usr/bin/env python + + +import shelve + +db = shelve.open("db", 'c') +db["test"] = "Test Ok!" +db.close() diff --git a/m_lib/hash/test/test-shelve2.py b/m_lib/hash/test/test-shelve2.py new file mode 100755 index 0000000..883e891 --- /dev/null +++ b/m_lib/hash/test/test-shelve2.py @@ -0,0 +1,8 @@ +#! /usr/bin/env python + + +import shelve + +db = shelve.open("db", 'r') +print db["test"] +db.close() diff --git a/m_lib/hash/test/test-shelve3.py b/m_lib/hash/test/test-shelve3.py new file mode 100755 index 0000000..e6f76d3 --- /dev/null +++ b/m_lib/hash/test/test-shelve3.py @@ -0,0 +1,12 @@ +#! /usr/bin/env python + + +from m_lib.hash import zshelve + +db = zshelve.CompressedShelf("dbz", 'c') +db["test"] = "Test Ok!" 
+db.close() + +db = zshelve.CompressedShelf("dbz", 'r') +print db["test"] +db.close() diff --git a/m_lib/hash/test/test-shelve4.py b/m_lib/hash/test/test-shelve4.py new file mode 100755 index 0000000..e084103 --- /dev/null +++ b/m_lib/hash/test/test-shelve4.py @@ -0,0 +1,12 @@ +#! /usr/bin/env python + + +from m_lib.hash import zshelve + +db = zshelve.CompressedKeysShelf("dbz", 'c') +db["test"] = "Test Ok!" +db.close() + +db = zshelve.CompressedKeysShelf("dbz", 'r') +print db["test"] +db.close() diff --git a/m_lib/hash/test/test-shelve5.py b/m_lib/hash/test/test-shelve5.py new file mode 100755 index 0000000..fb859e4 --- /dev/null +++ b/m_lib/hash/test/test-shelve5.py @@ -0,0 +1,14 @@ +#! /usr/bin/env python + + +from m_lib.hash import zshelve + +db = zshelve.CompressedKeysShelf("dbz", 'n') +db["test"] = "Test Ok!" +db.close() + +db = zshelve.CompressedKeysShelf("dbz", 'r') +print db.has_key("test") +print db.keys() +print db["test"] +db.close() diff --git a/m_lib/hash/test/test-shelve6.py b/m_lib/hash/test/test-shelve6.py new file mode 100755 index 0000000..3412bde --- /dev/null +++ b/m_lib/hash/test/test-shelve6.py @@ -0,0 +1,18 @@ +#! /usr/bin/env python + + +import anydbm, dbhash +anydbm._defaultmod = dbhash + + +from m_lib.hash import zshelve + +db = zshelve.CompressedKeysShelf("dbz", 'n') +db["test"] = "Test Ok!" +db.close() + +db = zshelve.CompressedKeysShelf("dbz", 'r') +print db.has_key("test") +print db.keys() +print db["test"] +db.close() diff --git a/m_lib/hash/test/test-zh.py b/m_lib/hash/test/test-zh.py new file mode 100755 index 0000000..3095b14 --- /dev/null +++ b/m_lib/hash/test/test-zh.py @@ -0,0 +1,29 @@ +#! /usr/bin/env python + + +from m_lib.hash import ZODBhash + + +print "Making..." +db = ZODBhash.open("db", 'c') +db["test"] = "Test Ok!" +db.close() + +print "Testing..." 
+db = ZODBhash.open("db", 'w') +print db["test"] +print len(db) +print db.keys() +print db.has_key("test") +print db.has_key("Test") +print db.get("test", "Yes") +print db.get("Test", "No") +del db["test"] +db.close() + +print "Testing (more)..." +db = ZODBhash.open("db", 'r') +print len(db) +print db.keys() +print db.has_key("test") +db.close() diff --git a/m_lib/hash/www/index.html b/m_lib/hash/www/index.html new file mode 100644 index 0000000..0efe833 --- /dev/null +++ b/m_lib/hash/www/index.html @@ -0,0 +1,20 @@ + + + +Disk-based hashes: (g)dbm/bsddb, ZODBhash, MetaKitHash + + + + + + + + +

Disk-based hashes: (g)dbm/bsddb, ZODBhash, MetaKitHash

+ +

+

+ + + + diff --git a/m_lib/hash/www/xxx.txt b/m_lib/hash/www/xxx.txt new file mode 100644 index 0000000..8276a2e --- /dev/null +++ b/m_lib/hash/www/xxx.txt @@ -0,0 +1,66 @@ + + I am providing here two additional modules and two patches for the +standard library. + + Those two modules are ZODBhash and MKhash. They provide dbm-like +interface based on ZODB and MetaKit. They are intended to be used by +anydbm, so I am also providing corresponding patches for anydbm.py and +whichdb.py. + + Download mzhash.zip - it contains modules, patches and simple tests. + + Also I made a patch for the shelve.py module. I created two additional +shelves - CompressedShelf and CompressedKeysShelf. These shelves use zlib to +compress/decompress data. CompressedShelf compresses only data, and +CompressedKeysShelf compresses both data and keys. + + Download mshelve.zip. + + Below is a long story why I created all this and how I compared them. + + I started with the need to create ispell-like hash with all forms of +every word. I needed this for full-text search. (BTW, I think it'd be nice +to include this kind of search into ZCatalog; I'll think about it later). I +looked into ispell and htdig sources and manuals, and found that I'd better +write my own programs and libraries instead of trying to wrap those +complex ones. + + I found (in ispell manual) I can generate simple text file with all +necessary information: ispell -e russian.words. So +the task is to construct a hash for fast access to this information. + + Very easy, thanks Python! Just read every line, split it and put into +disk-based hash (anydbm!). + + I wrote the program in a minute. The program generates two hashes. One +hash, words2root maps every word to its normal form ("passing" => "pass"). +Another, root2words maps normal form to the list of all forms ("pass" => +["pass", "passed", "passing", "passes", "passable", "impassable"]). The +hashes are named after htdig, of course. + + The first run was a surprise. 
It was running for 5 hours, swapping a +lot, and finally it generated two 85-megabyte files (Berkeley DB hashes). +170 megs from 10M text file! Wow!!! + + So I started to think I want to experiment with other disk-based hashes, +and I wanted to find a way to speed things up and lower disk requirements. + + Next thing I tried - ZODB. ZODB is itself hash (a sort of), so I easily +wrote ZODBhash wrapper. I reran my program. It failed. ZODB ate /tmp very +fast - 700 megabytes in one hour. I tried to commit subtransactions or +even transactions during write (__setitem__), but this was not of much +help, and my program stopped with IOError, "no space left on device" :( + + Then I tried to write compressed data to the hash. I created two +shelves - CompressedShelf and CompressedKeysShelf and tried them with bsddb. +I cleared my computer from all jobs, stopped XWindows, etc - and reran the +program two times - with Shelf and CompressedKeysShelf. Shelf created 2 85 +megs files in 3 hours, and CompressedShelf created 2 files - one 85 and the +other 21 megs - in 3.5 hours. Win in disk space (not much) and loss in +time. + + I tried to use gdbm instead of bsddb. Again, I ran the program two +times. Result: Shelf - 120 and 50 megs in 5 hours, CompressedKeysShelf - +120 and 13 megs in 4 hours. Some win and some loss. During the runs my +computer swapped a bit less than when I used Berkeley DB, so it seems gdbm +uses less memory. diff --git a/m_lib/hash/zshelve.py b/m_lib/hash/zshelve.py new file mode 100644 index 0000000..b9feb14 --- /dev/null +++ b/m_lib/hash/zshelve.py @@ -0,0 +1,47 @@ +"""Compressed shelves. 
+Author: Oleg Broytman +Copyright (C) 2001-2003 PhiloSoft Design +License: Python""" + + +from shelve import DbfilenameShelf +from zlib import compress, decompress + +try: + from cPickle import dumps, loads +except ImportError: + from Pickle import dumps, loads + + +class CompressedShelf(DbfilenameShelf): + """Shelf implementation using zlib for compressing data.""" + + compress_level = 6 # default compression + + def __getitem__(self, key): + return loads(decompress(self.dict[key])) + + def __setitem__(self, key, value): + self.dict[key] = compress(dumps(value), self.compress_level) + + +class CompressedKeysShelf(CompressedShelf): + """CompressedShelf implementation that also compresses keys.""" + + def keys(self): + _keys = [] + for key in self.dict.keys(): + _keys.append(decompress(key)) + return _keys + + def has_key(self, key): + return self.dict.has_key(compress(key, self.compress_level)) + + def __getitem__(self, key): + return CompressedShelf.__getitem__(self, compress(key, self.compress_level)) + + def __setitem__(self, key, value): + CompressedShelf.__setitem__(self, compress(key, self.compress_level), value) + + def __delitem__(self, key): + del self.dict[compress(key, self.compress_level)] diff --git a/setup.py b/setup.py index 670cbc9..c76dc8b 100755 --- a/setup.py +++ b/setup.py @@ -11,8 +11,8 @@ setup(name = "m_lib", url = "http://phdru.name/Software/Python/#m_lib", license = "GPL", platforms = "All", - packages = ["m_lib", "m_lib.clock", - "m_lib.flad", "m_lib.flad.test", "m_lib.lazy", + packages = ["m_lib", "m_lib.clock", "m_lib.flad", "m_lib.flad.test", + "m_lib.hash", "m_lib.hash.test", "m_lib.lazy", "m_lib.net", "m_lib.net.ftp", "m_lib.net.www", "m_lib.rus", ], data_files = [("%s/m_lib/flad/test" % python_lib, [