[Zope-Checkins] CVS: Zope/lib/python/Products/ZCTextIndex - BaseIndex.py:1.28.6.2 CosineIndex.py:1.22.10.1 IIndex.py:1.11.6.1 Lexicon.py:1.17.10.2 OkapiIndex.py:1.29.10.1 ZCTextIndex.py:1.35.2.7

Casey Duncan casey@zope.com
Thu, 5 Jun 2003 16:37:35 -0400


Update of /cvs-repository/Zope/lib/python/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv28323/lib/python/Products/ZCTextIndex

Modified Files:
      Tag: Zope-2_6-branch
	BaseIndex.py CosineIndex.py IIndex.py Lexicon.py OkapiIndex.py 
	ZCTextIndex.py 
Log Message:
Backport casey-zctextindex-fewer-conflicts-branch:

  - Indexes and Lexicon are now much less likely to generate write conflicts.
    Previously, *any* concurrent index/unindex operation would conflict.

  - Performance and scalability fix for queries



=== Zope/lib/python/Products/ZCTextIndex/BaseIndex.py 1.28.6.1 => 1.28.6.2 ===
--- Zope/lib/python/Products/ZCTextIndex/BaseIndex.py:1.28.6.1	Wed Feb 26 12:06:04 2003
+++ Zope/lib/python/Products/ZCTextIndex/BaseIndex.py	Thu Jun  5 16:37:04 2003
@@ -20,7 +20,7 @@
 from BTrees.IOBTree import IOBTree
 from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
 from BTrees.IIBTree import intersection, difference
-import BTrees.Length
+from BTrees.Length import Length
 
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
@@ -83,12 +83,18 @@
         self._docwords = IOBTree()
 
         # Use a BTree length for efficient length computation w/o conflicts
-        self.length = BTrees.Length.Length()
+        self.length = Length()
+        self.document_count = Length()
 
     def length(self):
         """Return the number of words in the index."""
         # This is overridden per instance
         return len(self._wordinfo)
+        
+    def document_count(self):
+        """Return the number of documents in the index"""
+        # This is overridden per instance
+        return len(self._docweight)        
 
     def get_words(self, docid):
         """Return a list of the wordids for a given docid."""
@@ -104,6 +110,11 @@
         self._mass_add_wordinfo(wid2weight, docid)
         self._docweight[docid] = docweight
         self._docwords[docid] = WidCode.encode(wids)
+        try:
+            self.document_count.change(1)
+        except AttributeError:
+            # Upgrade document_count to Length object
+            self.document_count = Length(self.document_count())
         return len(wids)
 
     # A subclass may wish to extend or override this.  This is for adjusting
@@ -165,6 +176,11 @@
             self._del_wordinfo(wid, docid)
         del self._docwords[docid]
         del self._docweight[docid]
+        try:
+            self.document_count.change(-1)
+        except AttributeError:
+            # Upgrade document_count to Length object
+            self.document_count = Length(self.document_count())
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)


=== Zope/lib/python/Products/ZCTextIndex/CosineIndex.py 1.22 => 1.22.10.1 ===
--- Zope/lib/python/Products/ZCTextIndex/CosineIndex.py:1.22	Tue May 28 19:42:20 2002
+++ Zope/lib/python/Products/ZCTextIndex/CosineIndex.py	Thu Jun  5 16:37:04 2003
@@ -69,7 +69,7 @@
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._docweight))
+        N = float(self.document_count())
         L = []
         DictType = type({})
         for wid in wids:
@@ -86,7 +86,7 @@
         wids = []
         for term in terms:
             wids += self._lexicon.termToWordIds(term)
-        N = float(len(self._docweight))
+        N = float(self.document_count())
         sum = 0.0
         for wid in self._remove_oov_wids(wids):
             wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)


=== Zope/lib/python/Products/ZCTextIndex/IIndex.py 1.11 => 1.11.6.1 ===
--- Zope/lib/python/Products/ZCTextIndex/IIndex.py:1.11	Wed Aug 14 18:25:14 2002
+++ Zope/lib/python/Products/ZCTextIndex/IIndex.py	Thu Jun  5 16:37:04 2003
@@ -20,6 +20,9 @@
     """Interface for an Index."""
 
     def length():
+        """Return the number of words in the index."""
+        
+    def document_count():
         """Return the number of documents in the index."""
 
     def get_words(docid):
@@ -62,10 +65,13 @@
         """
 
     def index_doc(docid, text):
-        "XXX"
+        """Add a document with the specified id and text to the index. If a
+        document by that id already exists, replace its text with the new
+        text provided
+        """
 
     def unindex_doc(docid):
-        "XXX"
+        """Remove the document with the specified id from the index"""
 
     def has_doc(docid):
         """Returns true if docid is an id of a document in the index"""


=== Zope/lib/python/Products/ZCTextIndex/Lexicon.py 1.17.10.1 => 1.17.10.2 ===
--- Zope/lib/python/Products/ZCTextIndex/Lexicon.py:1.17.10.1	Mon Dec  2 01:08:51 2002
+++ Zope/lib/python/Products/ZCTextIndex/Lexicon.py	Thu Jun  5 16:37:04 2003
@@ -16,6 +16,7 @@
 
 from BTrees.IOBTree import IOBTree
 from BTrees.OIBTree import OIBTree
+from BTrees.Length import Length
 
 import ZODB
 from Persistence import Persistent
@@ -37,16 +38,13 @@
         # we never saw before, and that isn't a known stopword (or otherwise
         # filtered out).  Returning a special wid value for OOV words is a
         # way to let clients know when an OOV word appears.
-        self._nextwid = 1
+        self.length = Length()
         self._pipeline = pipeline
 
-        # Keep some statistics about indexing
-        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
-        self._nwords = 0 # Number of words indexed (after pipeline)
-
     def length(self):
         """Return the number of unique terms in the lexicon."""
-        return self._nextwid - 1
+        # Overridden in instances
+        return len(self._wids)
 
     def words(self):
         return self._wids.keys()
@@ -59,11 +57,15 @@
 
     def sourceToWordIds(self, text):
         last = _text2list(text)
-        for t in last:
-            self._nbytes += len(t)
         for element in self._pipeline:
             last = element.process(last)
-        self._nwords += len(last)
+        if not hasattr(self.length, 'change'):
+            # Make sure length is overridden with a BTrees.Length.Length
+            self.length = Length(self.length())        
+        # Strategically unload the length value so that we get the most
+        # recent value written to the database to minimize conflicting wids
+        # XXX this will not work when MVCC is implemented in the ZODB...
+        self.length._p_deactivate()
         return map(self._getWordIdCreate, last)
 
     def termToWordIds(self, text):
@@ -138,9 +140,10 @@
         return wid
 
     def _new_wid(self):
-        wid = self._nextwid
-        self._nextwid += 1
-        return wid
+        self.length.change(1)
+        while self._words.has_key(self.length()): # just to be safe
+            self.length.change(1)
+        return self.length()
 
 def _text2list(text):
     # Helper: splitter input may be a string or a list of strings


=== Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py 1.29 => 1.29.10.1 ===
--- Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py:1.29	Wed May 29 16:47:44 2002
+++ Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py	Thu Jun  5 16:37:04 2003
@@ -18,6 +18,7 @@
 # understand what's going on.
 
 from BTrees.IIBTree import IIBucket
+from BTrees.Length import Length
 
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex.BaseIndex import BaseIndex, \
@@ -50,20 +51,29 @@
         # sum(self._docweight.values()), the total # of words in all docs
         # This is a long for "better safe than sorry" reasons.  It isn't
         # used often enough that speed should matter.
-        self._totaldoclen = 0L
+        # Use a BTree.Length.Length object to avoid concurrent write conflicts
+        self._totaldoclen = Length(0L)
 
     def index_doc(self, docid, text):
         count = BaseIndex.index_doc(self, docid, text)
-        self._totaldoclen += count
+        self._change_doc_len(count)
         return count
 
     def _reindex_doc(self, docid, text):
-        self._totaldoclen -= self._docweight[docid]
+        self._change_doc_len(-self._docweight[docid])
         return BaseIndex._reindex_doc(self, docid, text)
 
     def unindex_doc(self, docid):
-        self._totaldoclen -= self._docweight[docid]
+        self._change_doc_len(-self._docweight[docid])
         BaseIndex.unindex_doc(self, docid)
+    
+    def _change_doc_len(self, delta):
+        # Change total doc length used for scoring
+        try:
+            self._totaldoclen.change(delta)
+        except AttributeError:
+            # Opportunistically upgrade _totaldoclen attribute to Length object
+            self._totaldoclen = Length(long(self._totaldoclen + delta))
 
     # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
     # for each wid t in wids.  The IIBucket, times the weight, maps D to
@@ -76,8 +86,13 @@
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._docweight))  # total # of docs
-        meandoclen = self._totaldoclen / N
+        N = float(self.document_count())  # total # of docs
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
         K1 = self.K1
         B = self.B
         K1_plus1 = K1 + 1.0
@@ -120,8 +135,13 @@
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._docweight))  # total # of docs
-        meandoclen = self._totaldoclen / N
+        N = float(self.document_count())  # total # of docs
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
         #K1 = self.K1
         #B = self.B
         #K1_plus1 = K1 + 1.0


=== Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py 1.35.2.6 => 1.35.2.7 ===
--- Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py:1.35.2.6	Fri Feb 28 17:19:22 2003
+++ Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py	Thu Jun  5 16:37:04 2003
@@ -148,7 +148,7 @@
 
     ## Pluggable Index APIs ##
 
-    def index_object(self, docid, obj, threshold=None):
+    def index_object(self, docid, obj, threshold=None, attr=None):
         # XXX We currently ignore subtransaction threshold
         text = getattr(obj, self._fieldname, None)
         if text is None:
@@ -158,13 +158,11 @@
         if text is None:
             return 0
         count = self.index.index_doc(docid, text)
-        self._p_changed = 1 # XXX
         return count
 
     def unindex_object(self, docid):
         if self.index.has_doc(docid):
             self.index.unindex_doc(docid)
-            self._p_changed = 1 # XXX
 
     def _apply_index(self, request, cid=''):
         """Apply query specified by request, a mapping containing the query.
@@ -183,7 +181,7 @@
             return None
         tree = QueryParser(self.getLexicon()).parseQuery(query_str)
         results = tree.executeQuery(self.index)
-        return  results, (self._fieldname,)
+        return  results, (self.id,)
 
     def getEntryForObject(self, documentId, default=None):
         """Return the list of words indexed for documentId"""
@@ -217,13 +215,13 @@
 
     manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
 
-    def getIndexType(self):
-        """Return index type string"""
-        return getattr(self, '_index_type', self._index_factory.__name__)
-
     def getFieldName(self):
         """Return indexed attribute name"""
         return self._fieldname
+
+    def getIndexType(self):
+        """Return index type string"""
+        return getattr(self, '_index_type', self._index_factory.__name__)
 
     def getLexiconURL(self):
         """Return the url of the lexicon used by the index"""