
 I am providing here two additional modules and two patches for the
standard library.

 These two modules are ZODBhash and MKhash. They provide a dbm-like
interface based on ZODB and MetaKit. They are intended to be used by
anydbm, so I am also providing corresponding patches for anydbm.py and
whichdb.py.

 Download mzhash.zip - it contains the modules, patches and simple tests.
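 To give an idea of the dbm-style interface these modules implement, here
is a minimal sketch. This is illustrative only: in Python 3, anydbm and
whichdb were merged into the dbm package, and dbm.dumb is used below just
because it is available everywhere.

```python
# Sketch of the dbm-style mapping interface the modules implement.
# In modern Python, anydbm/whichdb live in the `dbm` package; the
# dbm.dumb backend is chosen here purely for portability.
import dbm.dumb
import os
import tempfile

path = os.path.join(tempfile.mkdtemp(), "words")

db = dbm.dumb.open(path, "c")   # "c": create the file if missing
db[b"passing"] = b"pass"        # dbm stores bytes keys and values
db.close()

db = dbm.dumb.open(path, "r")   # reopen read-only
print(db[b"passing"])           # b'pass'
db.close()
```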

 I also made a patch for the shelve.py module. I created two additional
shelves - CompressedShelf and CompressedKeysShelf. These shelves use zlib
to compress/decompress data. CompressedShelf compresses only the data,
while CompressedKeysShelf compresses both data and keys.

 Download mshelve.zip.
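 The idea behind CompressedShelf can be sketched like this (an
illustration only, not the actual code from mshelve.zip): subclass the
standard shelve.Shelf and run the pickled values through zlib on the way
in and out.

```python
# Sketch of the CompressedShelf idea: pickle the value as a normal
# Shelf would, then zlib-compress the pickled bytes before storing.
# This is an illustration, not the code from mshelve.zip.
import dbm.dumb
import os
import pickle
import shelve
import tempfile
import zlib

class CompressedShelf(shelve.Shelf):
    """Shelf that zlib-compresses values; keys are left alone."""

    def __getitem__(self, key):
        raw = self.dict[key.encode(self.keyencoding)]
        return pickle.loads(zlib.decompress(raw))

    def __setitem__(self, key, value):
        data = zlib.compress(pickle.dumps(value))
        self.dict[key.encode(self.keyencoding)] = data

path = os.path.join(tempfile.mkdtemp(), "forms")
shelf = CompressedShelf(dbm.dumb.open(path, "c"))
shelf["pass"] = ["pass", "passed", "passing", "passes"]
print(shelf["pass"])   # ['pass', 'passed', 'passing', 'passes']
shelf.close()
```

A CompressedKeysShelf would additionally compress the encoded key before
using it as the dbm key.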

 Below is the long story of why I created all this and how I compared
the alternatives.

 I started with the need to create an ispell-like hash with all forms of
every word. I needed this for full-text search. (BTW, I think it'd be nice
to include this kind of search in ZCatalog; I'll think about it later.) I
looked into the ispell and htdig sources and manuals, and found that I'd
better write my own programs and libraries instead of trying to wrap those
complex ones.

 I found (in the ispell manual) that I can generate a simple text file
with all the necessary information: ispell -e <russian.dict | sort
>russian.words. So the task is to construct a hash for fast access to this
information.

 Very easy, thanks to Python! Just read every line, split it and put it
into a disk-based hash (anydbm!).

 I wrote the program in a minute. The program generates two hashes. One
hash, words2root, maps every word to its normal form ("passing" =>
"pass"). The other, root2words, maps a normal form to the list of all its
forms ("pass" => ["pass", "passed", "passing", "passes", "passable",
"impassable"]). The hashes are named after htdig, of course.

 The first run was a surprise. It ran for 5 hours, swapping a lot, and
finally generated two 85-megabyte files (Berkeley DB hashes). 170 megs
from a 10-meg text file! Wow!!!
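 The core loop of that program can be sketched as follows. Assumptions:
each input line holds a root followed by all of its forms (roughly the
shape of the sorted ispell -e output), the sample lines are made up, and
shelve stands in for the anydbm-based code, since shelve can store the
lists directly.

```python
# Sketch of the two-hash build loop described above. The sample
# lines below are invented, not real ispell output.
import os
import shelve
import tempfile

lines = [
    "pass pass passed passes passing",
    "go go goes going gone went",
]

tmp = tempfile.mkdtemp()
words2root = shelve.open(os.path.join(tmp, "words2root"))
root2words = shelve.open(os.path.join(tmp, "root2words"))

for line in lines:
    forms = line.split()
    root, words = forms[0], forms[1:]
    for word in words:
        words2root[word] = root   # "passing" => "pass"
    root2words[root] = words      # "pass" => ["pass", "passed", ...]

print(words2root["passing"])      # pass
print(root2words["go"])           # ['go', 'goes', 'going', 'gone', 'went']
words2root.close()
root2words.close()
```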

 So I decided to experiment with other disk-based hashes, looking for a
way to speed things up and lower the disk space requirements.

 The next thing I tried was ZODB. ZODB is itself a hash (of sorts), so I
easily wrote the ZODBhash wrapper. I reran my program. It failed. ZODB ate
/tmp very fast - 700 megabytes in one hour. I tried to commit
subtransactions or even full transactions during writes (__setitem__), but
this was not much help, and my program died with an IOError, "no space
left on device" :(

 Then I tried to write compressed data to the hash. I created the two
shelves - CompressedShelf and CompressedKeysShelf - and tried them with
bsddb. I cleared my computer of all jobs, stopped X Windows, etc. - and
ran the program twice - with Shelf and CompressedKeysShelf. Shelf created
two 85-meg files in 3 hours, and CompressedShelf created two files - one
85 megs and the other 21 megs - in 3.5 hours. A win in disk space (not
much) and a loss in time.
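 The direction of this trade-off is easy to see in miniature: zlib does
well on the pickled lists of word forms, but poorly on very short strings
such as keys, where its header overhead can even make them grow. A quick
made-up measurement to illustrate:

```python
# Why compressing values pays off more than compressing keys:
# zlib's fixed overhead dominates on very short inputs.
import pickle
import zlib

# A pickled list of word forms, repeated to mimic a large value.
value = pickle.dumps(
    ["pass", "passed", "passes", "passing", "passable", "impassable"] * 50
)
key = b"passing"

print(len(value), len(zlib.compress(value)))   # the value shrinks a lot
print(len(key), len(zlib.compress(key)))       # the key actually grows
```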

 I then tried gdbm instead of bsddb. Again, I ran the program twice.
Result: Shelf - 120 and 50 megs in 5 hours; CompressedKeysShelf - 120 and
13 megs in 4 hours. Some wins and some losses. During the runs my computer
swapped a bit less than with Berkeley DB, so it seems gdbm uses less
memory.
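 A closing note on backends: the whichdb mechanism (merged into the dbm
package in Python 3) is what detects which library created a given file,
and it is presumably the hook my whichdb.py patch extends for ZODBhash and
MKhash. A small sketch, using dbm.dumb only because it is always
available:

```python
# whichdb-style detection: dbm.whichdb inspects an existing file
# and names the module that created it.
import dbm
import dbm.dumb
import os
import tempfile

path = os.path.join(tempfile.mkdtemp(), "test")
db = dbm.dumb.open(path, "c")
db[b"key"] = b"value"
db.close()

print(dbm.whichdb(path))   # dbm.dumb
```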