--- /dev/null
+"""Provide a (g)dbm-compatible interface to MetaKit.
+Author: Oleg Broytman <phd@phd.pp.ru>
+Copyright (C) 2001-2002 PhiloSoft Design
+License: Python"""
+
+
+import sys
+try:
+ import Mk4py
+except ImportError:
+ # prevent a second import of this module from spuriously succeeding
+ del sys.modules[__name__]
+ raise
+
+
+__all__ = ["error", "open"]
+
+error = ValueError
+
+
+class MKhash:
+ def __init__(self, file, flag, mode=0666, trans_threshold=1000):
+ self.read_only = 0
+ self._closed = 0
+
+ self.trans_threshold = trans_threshold
+ self._transcount = 0 # transactions counter - for commiting transactions
+
+ if flag in ('c', 'n'):
+ mode = 1
+ elif flag == 'r':
+ mode = 0
+ self.read_only = 1
+ else:
+ mode = 2
+
+ self.db = db = Mk4py.storage(file, mode)
+ if mode == 1:
+ self.vw = db.getas("hash[key:S,value:S]")
+ else:
+ self.vw = db.view("hash")
+
+ def __del__(self):
+ self.close()
+
+ def keys(self):
+ return map(lambda x: x.key, self.vw)
+
+ def __len__(self):
+ return len(self.vw)
+
+ def has_key(self, key):
+ return self.vw.find(key=key)+1
+
+ def get(self, key, default=None):
+ if self.has_key(key):
+ return self[key]
+ return default
+
+ def __getitem__(self, key):
+ vw = self.vw
+ ix = vw.find(key=key)
+ if ix == -1:
+ raise KeyError, key
+ return vw[ix].value
+
+ def __setitem__(self, key, value):
+ vw = self.vw
+ ix = vw.find(key=key)
+ if ix == -1:
+ vw.append(key=key, value=value)
+ else:
+ vw[ix].value = value
+ self._add_tran()
+
+ def __delitem__(self, key):
+ vw = self.vw
+ ix = vw.find(key=key)
+ if ix == -1:
+ raise KeyError, key
+ vw.delete(ix)
+ self._add_tran()
+
+ def close(self):
+ if self._closed: return
+ if not self.read_only: self.db.commit()
+ del self.db
+ self._closed = 1
+
+ def _add_tran(self):
+ self._transcount = self._transcount + 1
+ if self._transcount == self.trans_threshold:
+ self._transcount = 0
+ self.db.commit()
+
+
+def open(file, flag, mode=0666):
+ return MKhash(file, flag, mode)
--- /dev/null
+"""Provide a (g)dbm-compatible interface to ZODB.
+Author: Oleg Broytman <phd@phd.pp.ru>
+Copyright (C) 2001-2002 PhiloSoft Design
+License: Python"""
+
+
+import sys
+try:
+ from ZODB import FileStorage, DB, POSException
+except ImportError:
+ # prevent a second import of this module from spuriously succeeding
+ del sys.modules[__name__]
+ raise
+
+
+__all__ = ["error", "open"]
+
+error = POSException.POSError # Exported for anydbm
+
+
+class ZODBhash:
+ def __init__(self, file, flag, mode=0666, trans_threshold=1000):
+ create = (flag == 'n') # force recreation
+ # if flag == 'w' or 'c' and file does not exist FileStorage will set it to 1 for us
+
+ self.read_only = read_only = (flag == 'r')
+ self._closed = 0
+
+ self.trans_threshold = trans_threshold
+ self._transcount = 0 # transactions counter - for commiting transactions
+
+ storage = FileStorage.FileStorage(file, create=create, read_only = read_only)
+ db = DB(storage)
+ self.conn = conn = db.open()
+ self.dbroot = conn.root()
+
+ def __del__(self):
+ self.close()
+
+ def keys(self):
+ return self.dbroot.keys()
+
+ def __len__(self):
+ return len(self.dbroot)
+
+ def has_key(self, key):
+ return self.dbroot.has_key(key)
+
+ def get(self, key, default=None):
+ if self.dbroot.has_key(key):
+ return self[key]
+ return default
+
+ def __getitem__(self, key):
+ return self.dbroot[key]
+
+ def __setitem__(self, key, value):
+ self.dbroot[key] = value
+ self._add_tran()
+
+ def __delitem__(self, key):
+ del self.dbroot[key]
+ self._add_tran()
+
+ def close(self):
+ if self._closed: return
+ if not self.read_only:
+ get_transaction().commit()
+ self.conn.db().close()
+ self.conn.close()
+ self._closed = 1
+
+ def _add_tran(self):
+ self._transcount = self._transcount + 1
+ if self._transcount == self.trans_threshold:
+ self._transcount = 0
+ get_transaction().commit()
+
+
+def open(file, flag, mode=0666):
+ return ZODBhash(file, flag, mode)
--- /dev/null
+"""Extended disk hashes package. It extends anydbm/whichdb with ZODB and
+MetaKit-based hashes.
+Author: Oleg Broytman <phd@phd.pp.ru>
+Copyright (C) 2001-2003 PhiloSoft Design
+License: Python"""
+
+
+__all__ = ["zshelve", "ZODBhash", "MKhash"]
+
+
+import anydbm
+anydbm._names.insert(len(anydbm._names)-1, ['ZODBhash', 'MKhash'])
+ # Insert before dumbdbm
+
+
+import whichdb
+# Keep references to the module and to its original whichdb() function;
+# the name "whichdb" is rebound to the extended function defined below.
+_orig_module = whichdb
+_orig_whichdb = _orig_module.whichdb
+
+def whichdb(filename):
+ result = _orig_whichdb(filename)
+ if result:
+ return result
+
+ try:
+ f = open(filename, "rb")
+ except IOError:
+ return None
+
+ # Read the start of the file -- the magic number
+ s = f.read(4)
+ f.close()
+
+ # Return "" if not at least 4 bytes
+ if len(s) != 4:
+ return ""
+
+ # Check for MetaKit
+ if s == "JL\x1A\0":
+ return "MKhash"
+
+ # Check for ZODB
+ if s == "FS21":
+ return "ZODBhash"
+
+ # Unknown
+ return ""
+
+# Replace whichdb.whichdb with the extended version and give it the
+# docstring of the function it wraps.
+_orig_module.whichdb = whichdb # Now install our extended replacement
+whichdb.__doc__ = _orig_whichdb.__doc__
--- /dev/null
+#! /usr/bin/env python
+
+
+# Create the MetaKit database "db" (flag 'c') and store a single record.
+from m_lib.hash import MKhash
+
+
+print "Making..."
+db = MKhash.open("db", 'c')
+db["test"] = "Test Ok!"
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Open the MetaKit database "db" with flag 'w', print the results of the
+# lookup methods for an existing ("test") and a missing ("Test") key, then
+# delete the record.
+from m_lib.hash import MKhash
+
+
+print "Testing..."
+db = MKhash.open("db", 'w')
+print db["test"]
+print len(db)
+print db.keys()
+print db.has_key("test")
+print db.has_key("Test")
+print db.get("test", "Yes")
+print db.get("Test", "No")
+del db["test"]
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Open the MetaKit database "db" read-only (flag 'r') and print its size,
+# its keys and whether the key "test" is present.
+from m_lib.hash import MKhash
+
+
+print "Testing (more)..."
+db = MKhash.open("db", 'r')
+print len(db)
+print db.keys()
+print db.has_key("test")
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Create the shelf "db" with the standard shelve module and store one record.
+import shelve
+
+db = shelve.open("db", 'c')
+db["test"] = "Test Ok!"
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Open the shelf "db" read-only and print the record stored under "test".
+import shelve
+
+db = shelve.open("db", 'r')
+print db["test"]
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Write one record to the CompressedShelf "dbz", then reopen it read-only
+# and print the record.
+from m_lib.hash import zshelve
+
+db = zshelve.CompressedShelf("dbz", 'c')
+db["test"] = "Test Ok!"
+db.close()
+
+db = zshelve.CompressedShelf("dbz", 'r')
+print db["test"]
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Write one record to the CompressedKeysShelf "dbz" (flag 'c'), then reopen
+# it read-only and print the record.
+from m_lib.hash import zshelve
+
+db = zshelve.CompressedKeysShelf("dbz", 'c')
+db["test"] = "Test Ok!"
+db.close()
+
+db = zshelve.CompressedKeysShelf("dbz", 'r')
+print db["test"]
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Open the CompressedKeysShelf "dbz" with flag 'n', store one record, then
+# reopen it read-only and print has_key(), keys() and the record itself.
+from m_lib.hash import zshelve
+
+db = zshelve.CompressedKeysShelf("dbz", 'n')
+db["test"] = "Test Ok!"
+db.close()
+
+db = zshelve.CompressedKeysShelf("dbz", 'r')
+print db.has_key("test")
+print db.keys()
+print db["test"]
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Same checks as the CompressedKeysShelf 'n' test, but with anydbm's
+# _defaultmod set to dbhash before the shelf is opened.
+import anydbm, dbhash
+anydbm._defaultmod = dbhash
+
+
+from m_lib.hash import zshelve
+
+db = zshelve.CompressedKeysShelf("dbz", 'n')
+db["test"] = "Test Ok!"
+db.close()
+
+db = zshelve.CompressedKeysShelf("dbz", 'r')
+print db.has_key("test")
+print db.keys()
+print db["test"]
+db.close()
--- /dev/null
+#! /usr/bin/env python
+
+
+# Exercise ZODBhash in three passes over the same file "db": create it and
+# store a record, reopen with flag 'w' to look the record up and delete it,
+# then reopen read-only and print what is left.
+from m_lib.hash import ZODBhash
+
+
+print "Making..."
+db = ZODBhash.open("db", 'c')
+db["test"] = "Test Ok!"
+db.close()
+
+print "Testing..."
+db = ZODBhash.open("db", 'w')
+print db["test"]
+print len(db)
+print db.keys()
+print db.has_key("test")
+print db.has_key("Test")
+print db.get("test", "Yes")
+print db.get("Test", "No")
+del db["test"]
+db.close()
+
+print "Testing (more)..."
+db = ZODBhash.open("db", 'r')
+print len(db)
+print db.keys()
+print db.has_key("test")
+db.close()
--- /dev/null
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
+<HTML>
+<HEAD>
+<TITLE>Disk-based hashes: (g)dbm/bsddb, ZODBhash, MetaKitHash</TITLE>
+<META NAME="description" CONTENT="Broytman Python Software">
+<META NAME="author" CONTENT="Oleg Broytman">
+<LINK REV="made" HREF="mailto:phd@phd.pp.ru">
+<link rel="stylesheet" type="text/css" href="../../../phd.css">
+</HEAD>
+
+<BODY>
+
+<H1>Disk-based hashes: (g)dbm/bsddb, ZODBhash, MetaKitHash</H1>
+
+<p class="head">
+</p>
+
+
+</BODY>
+</HTML>
--- /dev/null
+
+ I am providing here two additional modules and two patches for the
+standard library.
+
+ Those two modules are ZODBhash and MKhash. They provide a dbm-like
+interface based on ZODB and MetaKit. They are intended to be used by
+anydbm, so I am also providing corresponding patches for anydbm.py and
+whichdb.py.
+
+ Download mzhash.zip - it contains modules, patches and simple tests.
+
+ Also I made a patch for the shelve.py module. I created two additional
+shelves - CompressedShelf and CompressedKeysShelf. These shelves use zlib to
+compress/decompress data. CompressedShelf compresses only data, and
+CompressedKeysShelf compresses both data and keys.
+
+ Download mshelve.zip.
+
+ Below is a long story why I created all this and how I compared them.
+
+ I started with the need to create ispell-like hash with all forms of
+every word. I needed this for full-text search. (BTW, I think it'd be nice
+to include this kind of search into ZCatalog; I'll think about it later). I
+looked into ispell and htdig sources and manuals, and found that I'd better
+write my own programs and libraries instead of trying to wrap those
+complex ones.
+
+ I found (in the ispell manual) that I can generate a simple text file with
+all the necessary information: ispell -e <russian.dict | sort >russian.words.
+So the task is to construct a hash for fast access to this information.
+
+ Very easy, thanks Python! Just read every line, split it and put into
+disk-based hash (anydbm!).
+
+ I wrote the program in a minute. The program generates two hashes. One
+hash, words2root maps every word to its normal form ("passing" => "pass").
+Another, root2words maps normal form to the list of all forms ("pass" =>
+["pass", "passed", "passing", "passes", "passable", "impassable"]). The
+hashes are named after htdig, of course.
+
+ The first run was a surprise. It ran for 5 hours, swapping a
+lot, and finally it generated two 85-megabyte files (Berkeley DB hashes).
+170 megs from a 10M text file! Wow!!!
+
+ So I started to think I want to experiment with other disk-based hashes,
+and I wanted to find a way to speed things up and lower disk requirements.
+
+ The next thing I tried was ZODB. ZODB is itself a hash (of a sort), so I
+easily wrote the ZODBhash wrapper. I reran my program. It failed. ZODB ate
+/tmp very fast - 700 megabytes in one hour. I tried to commit subtransactions
+or even transactions during write (__setitem__), but this was not of much
+help, and my program was stopped by IOError, "no space left on device" :(
+
+ Then I tried to write compressed data to the hash. I created two
+shelves - CompressedShelf and CompressedKeysShelf - and tried them with bsddb.
+I cleared my computer of all jobs, stopped XWindows, etc - and reran the
+program two times - with Shelf and CompressedKeysShelf. Shelf created 2 85
+megs files in 3 hours, and CompressedShelf created 2 files - one 85 and the
+other 21 megs - in 3.5 hours. A win in disk space (not much) and a loss in
+time.
+
+ I tried to use gdbm instead of bsddb. Again, I ran the program two
+times. Result: Shelf - 120 and 50 megs in 5 hours, CompressedKeysShelf -
+120 and 13 megs in 4 hours. Some wins and some losses. During the runs my
+computer swapped a bit less than when I used Berkeley DB, so it seems gdbm
+uses less memory.
--- /dev/null
+"""Compressed shelves.
+Author: Oleg Broytman <phd@phd.pp.ru>
+Copyright (C) 2001-2003 PhiloSoft Design
+License: Python"""
+
+
+from shelve import DbfilenameShelf
+from zlib import compress, decompress
+
+try:
+ from cPickle import dumps, loads
+except ImportError:
+ from Pickle import dumps, loads
+
+
+class CompressedShelf(DbfilenameShelf):
+ """Shelf implementation using zlib for compressing data."""
+
+ compress_level = 6 # default compression
+
+ def __getitem__(self, key):
+ return loads(decompress(self.dict[key]))
+
+ def __setitem__(self, key, value):
+ self.dict[key] = compress(dumps(value), self.compress_level)
+
+
+class CompressedKeysShelf(CompressedShelf):
+ """CompressedShelf implementation that also compresses keys."""
+
+ def keys(self):
+ _keys = []
+ for key in self.dict.keys():
+ _keys.append(decompress(key))
+ return _keys
+
+ def has_key(self, key):
+ return self.dict.has_key(compress(key, self.compress_level))
+
+ def __getitem__(self, key):
+ return CompressedShelf.__getitem__(self, compress(key, self.compress_level))
+
+ def __setitem__(self, key, value):
+ CompressedShelf.__setitem__(self, compress(key, self.compress_level), value)
+
+ def __delitem__(self, key):
+ del self.dict[compress(key, self.compress_level)]
url = "http://phdru.name/Software/Python/#m_lib",
license = "GPL",
platforms = "All",
- packages = ["m_lib", "m_lib.clock",
- "m_lib.flad", "m_lib.flad.test", "m_lib.lazy",
+ packages = ["m_lib", "m_lib.clock", "m_lib.flad", "m_lib.flad.test",
+ "m_lib.hash", "m_lib.hash.test", "m_lib.lazy",
"m_lib.net", "m_lib.net.ftp", "m_lib.net.www", "m_lib.rus",
],
data_files = [("%s/m_lib/flad/test" % python_lib, [