[Zope-CVS] CVS: Products/ZCTextIndex/tests - mhindex.py:1.8
Guido van Rossum
guido@python.org
Thu, 23 May 2002 00:07:58 -0400
Update of /cvs-repository/Products/ZCTextIndex/tests
In directory cvs.zope.org:/tmp/cvs-serv14945
Modified Files:
mhindex.py
Log Message:
Buncha updates:
- Use slightly more portable default values for the Data.fs and
Zope/lib/python paths.
- Add -t NNN option to specify how often to commit a transaction;
default 20,000.
- Change -p into -p NNN to specify how often (counted in commits) to
pack (default 0 -- never pack).
- Reworked the commit and pack logic to maintain the various counters
across folders (see the sketch after this list).
- Store relative paths (e.g. "inbox/1").
- Store the mtime of indexed messages in doctimes[docid].
- Store the mtime of indexed folders in watchfolders[folder] (unused).
- Refactor updatefolder() to:
(a) Avoid indexing messages it's already indexed and whose mtime
hasn't changed. (This probably needs an override just in case.)
(b) Unindex messages that no longer exist in the folder.
- Include the folder name and the message header fields from, to, cc,
bcc, and subject in the text to be indexed.
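
A minimal standalone sketch of the new commit/pack bookkeeping (illustration
only, no ZODB; the class name CommitPacker is invented here, but the counter
names, thresholds and defaults mirror the Indexer changes in the diff below):

    class CommitPacker:
        """Illustration of the counter logic Indexer now carries across folders."""

        def __init__(self, trans=20000, pack=0):
            self.trans_limit = trans    # -t NNN: commit every NNN operations
            self.pack_limit = pack      # -p NNN: pack every NNN commits (0 = never)
            self.trans_count = 0
            self.pack_count = 0

        def maycommit(self):
            # Called once per indexed or unindexed message.
            self.trans_count += 1
            if self.trans_count >= self.trans_limit > 0:
                self.commit()

        def commit(self):
            if self.trans_count > 0:
                print "committing..."   # get_transaction().commit() in mhindex.py
                self.trans_count = 0
                self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

        def pack(self):
            if self.pack_count > 0:
                print "packing..."      # self.database.pack() in mhindex.py
                self.pack_count = 0

With the defaults (-t 20000, -p 0) this commits after every 20,000 indexing
operations and never packs; e.g. -p 5 would pack after every fifth commit.
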
=== Products/ZCTextIndex/tests/mhindex.py 1.7 => 1.8 ===
"""MH mail indexer."""
+import os
import re
import sys
import time
@@ -9,17 +10,19 @@
import getopt
import traceback
from StringIO import StringIO
+from stat import ST_MTIME
-DATAFS = "/home/guido/.Data.fs"
-ZOPECODE = "/home/guido/projects/ds9/lib/python"
+DATAFS = "~/.Data.fs"
+ZOPECODE = "~/projects/Zope/lib/python"
-sys.path.append(ZOPECODE)
+sys.path.append(os.path.expanduser(ZOPECODE))
from ZODB import DB
from ZODB.FileStorage import FileStorage
from Persistence import Persistent
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
+from BTrees.IIBTree import IIBTree
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
@@ -33,7 +36,7 @@
def main():
try:
- opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Opu")
+ opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Op:t:u")
except getopt.error, msg:
print msg
sys.exit(2)
@@ -42,8 +45,9 @@
optimize = 0
nbest = NBEST
maxlines = MAXLINES
- datafs = DATAFS
+ datafs = os.path.expanduser(DATAFS)
pack = 0
+ trans = 20000
for o, a in opts:
if o == "-b":
bulk = 1
@@ -56,18 +60,18 @@
if o == "-O":
optimize = 1
if o == "-p":
- pack = 1
+ pack = int(a)
+ if o == "-t":
+ trans = int(a)
if o == "-u":
update = 1
- ix = Indexer(datafs, update or bulk)
+ ix = Indexer(datafs, writable=update or bulk, trans=trans, pack=pack)
if bulk:
if optimize:
ix.optimize(args)
ix.bulkupdate(args)
elif update:
ix.update(args)
- if pack:
- ix.pack()
elif args:
for i in range(len(args)):
a = args[i]
@@ -79,12 +83,18 @@
ix.query(" ".join(args), nbest, maxlines)
else:
ix.interact(nbest)
+ if pack:
+ ix.pack()
class Indexer:
filestorage = database = connection = root = None
- def __init__(self, datafs, writable=0):
+ def __init__(self, datafs, writable=0, trans=0, pack=0):
+ self.trans_limit = trans
+ self.pack_limit = pack
+ self.trans_count = 0
+ self.pack_count = 0
self.stopdict = get_stopdict()
self.mh = mhlib.MH()
self.filestorage = FileStorage(datafs, read_only=(not writable))
@@ -99,6 +109,14 @@
self.docpaths = self.root["docpaths"]
except KeyError:
self.docpaths = self.root["docpaths"] = IOBTree()
+ try:
+ self.doctimes = self.root["doctimes"]
+ except KeyError:
+ self.doctimes = self.root["doctimes"] = IIBTree()
+ try:
+ self.watchfolders = self.root["watchfolders"]
+ except KeyError:
+ self.watchfolders = self.root["watchfolders"] = {}
self.path2docid = OIBTree()
for docid in self.docpaths.keys():
path = self.docpaths[docid]
@@ -195,6 +213,7 @@
path = self.docpaths[docid]
score = min(100, int(score * factor))
print "Rank: %d Score: %d%% File: %s" % (rank, score, path)
+ path = os.path.join(self.mh.getpath(), path)
fp = open(path)
msg = mhlib.Message("<folder>", 0, fp)
for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
@@ -254,6 +273,7 @@
msgs.sort()
self.updatefolder(f, msgs)
+ self.commit()
def optimize(self, args):
uniqwords = {}
@@ -279,19 +299,14 @@
for n in msgs:
print "prescanning", n
m = f.openmessage(n)
- text = self.getmessagetext(m)
+ text = self.getmessagetext(m, f.name)
for p in pipeline:
text = p.process(text)
for word in text:
uniqwords[word] = uniqwords.get(word, 0) + 1
def bulkupdate(self, args):
- chunk = 5000
- target = len(self.docpaths) + chunk
for folder in args:
- if len(self.docpaths) >= target:
- self.pack()
- target = len(self.docpaths) + chunk
if folder.startswith("+"):
folder = folder[1:]
print "\nFOLDER", folder
@@ -302,31 +317,34 @@
continue
self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths)
- self.pack()
+ self.commit()
print "Indexed", self.index.lexicon._nbytes, "bytes and",
print self.index.lexicon._nwords, "words;",
print len(self.index.lexicon._words), "unique words."
def updatefolder(self, f, msgs):
- done = 0
- new = 0
+ self.watchfolders[f.name] = self.getmtime(f.name)
for n in msgs:
- print "indexing", n
+ path = "%s/%s" % (f.name, n)
+ docid = self.path2docid.get(path, 0)
+ if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
+ print "unchanged", docid, path
+ continue
+ docid = self.newdocid(path)
m = f.openmessage(n)
- text = self.getmessagetext(m)
- path = f.getmessagefilename(n)
- self.unindexpath(path)
+ text = self.getmessagetext(m, f.name)
if not text:
+ self.unindexpath(path)
continue
- docid = self.newdocid(path)
+ print "indexing", docid, path
self.index.index_text(docid, text)
- done += 1
- new = 1
- if done%500 == 0:
- self.commit()
- new = 0
- if new:
- self.commit()
+ self.maycommit()
+ # Remove messages from the folder that no longer exist
+ for path in self.path2docid.keys(f.name):
+ if not path.startswith(f.name + "/"):
+ break
+ if self.getmtime(path) == 0:
+ self.unindexpath(path)
print "done."
def unindexpath(self, path):
@@ -334,14 +352,19 @@
docid = self.path2docid[path]
print "unindexing", docid, path
del self.docpaths[docid]
+ del self.doctimes[docid]
del self.path2docid[path]
try:
self.index.unindex(docid)
except KeyError, msg:
print "KeyError", msg
+ self.maycommit()
- def getmessagetext(self, m):
+ def getmessagetext(self, m, name=None):
L = []
+ if name:
+ L.append("_folder " + name) # To restrict search to a folder
+ self.getheaders(m, L)
try:
self.getmsgparts(m, L, 0)
except:
@@ -361,22 +384,57 @@
elif ctype == "message/rfc822":
f = StringIO(m.getbodytext())
m = mhlib.Message("<folder>", 0, f)
+ self.getheaders(m, L)
self.getmsgparts(m, L, level+1)
+ def getheaders(self, m, L):
+ H = []
+ for key in "from", "to", "cc", "bcc", "subject":
+ value = m.get(key)
+ if value:
+ H.append(value)
+ if H:
+ L.append("\n".join(H))
+
def newdocid(self, path):
+ docid = self.path2docid.get(path)
+ if docid is not None:
+ self.doctimes[docid] = self.getmtime(path)
+ return docid
docid = self.maxdocid + 1
self.maxdocid = docid
self.docpaths[docid] = path
+ self.doctimes[docid] = self.getmtime(path)
self.path2docid[path] = docid
return docid
+ def getmtime(self, path):
+ path = os.path.join(self.mh.getpath(), path)
+ try:
+ st = os.stat(path)
+ except os.error, msg:
+ return 0
+ return st[ST_MTIME]
+
+ def maycommit(self):
+ self.trans_count += 1
+ if self.trans_count >= self.trans_limit > 0:
+ self.commit()
+
def commit(self):
- print "committing..."
- get_transaction().commit()
+ if self.trans_count > 0:
+ print "committing..."
+ get_transaction().commit()
+ self.trans_count = 0
+ self.pack_count += 1
+ if self.pack_count >= self.pack_limit > 0:
+ self.pack()
def pack(self):
- print "packing..."
- self.database.pack()
+ if self.pack_count > 0:
+ print "packing..."
+ self.database.pack()
+ self.pack_count = 0
class TextIndex(Persistent):
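
For context, a rough usage sketch of the updated Indexer interface (not part
of the diff; the Data.fs path and folder name are placeholders, and update()
is assumed to take a list of folder names, as main() passes its remaining
arguments straight through):

    import os
    from mhindex import Indexer          # assumes mhindex.py is importable

    # Roughly equivalent to: mhindex.py -u -t 10000 -p 5 inbox
    ix = Indexer(os.path.expanduser("~/.Data.fs"),
                 writable=1,              # -u/-b open the FileStorage read-write
                 trans=10000,             # -t NNN: commit every 10,000 operations
                 pack=5)                  # -p NNN: pack after every 5 commits
    ix.update(["inbox"])                  # reindexes only new or changed messages
    ix.pack()                             # no-op unless commits happened since the last pack
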