[Zope-Checkins] CVS: Zope/lib/python/Products/ZCTextIndex - BaseIndex.py:1.28.6.2 CosineIndex.py:1.22.10.1 IIndex.py:1.11.6.1 Lexicon.py:1.17.10.2 OkapiIndex.py:1.29.10.1 ZCTextIndex.py:1.35.2.7
Casey Duncan
casey@zope.com
Thu, 5 Jun 2003 16:37:35 -0400
Update of /cvs-repository/Zope/lib/python/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv28323/lib/python/Products/ZCTextIndex
Modified Files:
Tag: Zope-2_6-branch
BaseIndex.py CosineIndex.py IIndex.py Lexicon.py OkapiIndex.py
ZCTextIndex.py
Log Message:
Backport casey-zctextindex-fewer-conflicts-branch:
- Indexes and Lexicon are now much less likely to generate write conflicts.
Previously, *any* concurrent index/unindex operation would conflict.
- Performance and scalability fix for queries.
=== Zope/lib/python/Products/ZCTextIndex/BaseIndex.py 1.28.6.1 => 1.28.6.2 ===
--- Zope/lib/python/Products/ZCTextIndex/BaseIndex.py:1.28.6.1 Wed Feb 26 12:06:04 2003
+++ Zope/lib/python/Products/ZCTextIndex/BaseIndex.py Thu Jun 5 16:37:04 2003
@@ -20,7 +20,7 @@
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
from BTrees.IIBTree import intersection, difference
-import BTrees.Length
+from BTrees.Length import Length
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
@@ -83,12 +83,18 @@
self._docwords = IOBTree()
# Use a BTree length for efficient length computation w/o conflicts
- self.length = BTrees.Length.Length()
+ self.length = Length()
+ self.document_count = Length()
def length(self):
"""Return the number of words in the index."""
# This is overridden per instance
return len(self._wordinfo)
+
+ def document_count(self):
+ """Return the number of documents in the index"""
+ # This is overridden per instance
+ return len(self._docweight)
def get_words(self, docid):
"""Return a list of the wordids for a given docid."""
@@ -104,6 +110,11 @@
self._mass_add_wordinfo(wid2weight, docid)
self._docweight[docid] = docweight
self._docwords[docid] = WidCode.encode(wids)
+ try:
+ self.document_count.change(1)
+ except AttributeError:
+ # Upgrade document_count to Length object
+ self.document_count = Length(self.document_count())
return len(wids)
# A subclass may wish to extend or override this. This is for adjusting
@@ -165,6 +176,11 @@
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
+ try:
+ self.document_count.change(-1)
+ except AttributeError:
+ # Upgrade document_count to Length object
+ self.document_count = Length(self.document_count())
def search(self, term):
wids = self._lexicon.termToWordIds(term)
=== Zope/lib/python/Products/ZCTextIndex/CosineIndex.py 1.22 => 1.22.10.1 ===
--- Zope/lib/python/Products/ZCTextIndex/CosineIndex.py:1.22 Tue May 28 19:42:20 2002
+++ Zope/lib/python/Products/ZCTextIndex/CosineIndex.py Thu Jun 5 16:37:04 2003
@@ -69,7 +69,7 @@
def _search_wids(self, wids):
if not wids:
return []
- N = float(len(self._docweight))
+ N = float(self.document_count())
L = []
DictType = type({})
for wid in wids:
@@ -86,7 +86,7 @@
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
- N = float(len(self._docweight))
+ N = float(self.document_count())
sum = 0.0
for wid in self._remove_oov_wids(wids):
wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
=== Zope/lib/python/Products/ZCTextIndex/IIndex.py 1.11 => 1.11.6.1 ===
--- Zope/lib/python/Products/ZCTextIndex/IIndex.py:1.11 Wed Aug 14 18:25:14 2002
+++ Zope/lib/python/Products/ZCTextIndex/IIndex.py Thu Jun 5 16:37:04 2003
@@ -20,6 +20,9 @@
"""Interface for an Index."""
def length():
+ """Return the number of words in the index."""
+
+ def document_count():
"""Return the number of documents in the index."""
def get_words(docid):
@@ -62,10 +65,13 @@
"""
def index_doc(docid, text):
- "XXX"
+ """Add a document with the specified id and text to the index. If a
+ document by that id already exists, replace its text with the new
+ text provided
+ """
def unindex_doc(docid):
- "XXX"
+ """Remove the document with the specified id from the index"""
def has_doc(docid):
"""Returns true if docid is an id of a document in the index"""
=== Zope/lib/python/Products/ZCTextIndex/Lexicon.py 1.17.10.1 => 1.17.10.2 ===
--- Zope/lib/python/Products/ZCTextIndex/Lexicon.py:1.17.10.1 Mon Dec 2 01:08:51 2002
+++ Zope/lib/python/Products/ZCTextIndex/Lexicon.py Thu Jun 5 16:37:04 2003
@@ -16,6 +16,7 @@
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
+from BTrees.Length import Length
import ZODB
from Persistence import Persistent
@@ -37,16 +38,13 @@
# we never saw before, and that isn't a known stopword (or otherwise
# filtered out). Returning a special wid value for OOV words is a
# way to let clients know when an OOV word appears.
- self._nextwid = 1
+ self.length = Length()
self._pipeline = pipeline
- # Keep some statistics about indexing
- self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
- self._nwords = 0 # Number of words indexed (after pipeline)
-
def length(self):
"""Return the number of unique terms in the lexicon."""
- return self._nextwid - 1
+ # Overridden in instances
+ return len(self._wids)
def words(self):
return self._wids.keys()
@@ -59,11 +57,15 @@
def sourceToWordIds(self, text):
last = _text2list(text)
- for t in last:
- self._nbytes += len(t)
for element in self._pipeline:
last = element.process(last)
- self._nwords += len(last)
+ if not hasattr(self.length, 'change'):
+ # Make sure length is overridden with a BTrees.Length.Length
+ self.length = Length(self.length())
+ # Strategically unload the length value so that we get the most
+ # recent value written to the database to minimize conflicting wids
+ # XXX this will not work when MVCC is implemented in the ZODB...
+ self.length._p_deactivate()
return map(self._getWordIdCreate, last)
def termToWordIds(self, text):
@@ -138,9 +140,10 @@
return wid
def _new_wid(self):
- wid = self._nextwid
- self._nextwid += 1
- return wid
+ self.length.change(1)
+ while self._words.has_key(self.length()): # just to be safe
+ self.length.change(1)
+ return self.length()
def _text2list(text):
# Helper: splitter input may be a string or a list of strings
=== Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py 1.29 => 1.29.10.1 ===
--- Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py:1.29 Wed May 29 16:47:44 2002
+++ Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py Thu Jun 5 16:37:04 2003
@@ -18,6 +18,7 @@
# understand what's going on.
from BTrees.IIBTree import IIBucket
+from BTrees.Length import Length
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex, \
@@ -50,20 +51,29 @@
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
- self._totaldoclen = 0L
+ # Use a BTrees.Length.Length object to avoid concurrent write conflicts
+ self._totaldoclen = Length(0L)
def index_doc(self, docid, text):
count = BaseIndex.index_doc(self, docid, text)
- self._totaldoclen += count
+ self._change_doc_len(count)
return count
def _reindex_doc(self, docid, text):
- self._totaldoclen -= self._docweight[docid]
+ self._change_doc_len(-self._docweight[docid])
return BaseIndex._reindex_doc(self, docid, text)
def unindex_doc(self, docid):
- self._totaldoclen -= self._docweight[docid]
+ self._change_doc_len(-self._docweight[docid])
BaseIndex.unindex_doc(self, docid)
+
+ def _change_doc_len(self, delta):
+ # Change total doc length used for scoring
+ try:
+ self._totaldoclen.change(delta)
+ except AttributeError:
+ # Opportunistically upgrade _totaldoclen attribute to Length object
+ self._totaldoclen = Length(long(self._totaldoclen + delta))
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
@@ -76,8 +86,13 @@
def _search_wids(self, wids):
if not wids:
return []
- N = float(len(self._docweight)) # total # of docs
- meandoclen = self._totaldoclen / N
+ N = float(self.document_count()) # total # of docs
+ try:
+ doclen = self._totaldoclen()
+ except TypeError:
+ # _totaldoclen has not yet been upgraded
+ doclen = self._totaldoclen
+ meandoclen = doclen / N
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
@@ -120,8 +135,13 @@
def _search_wids(self, wids):
if not wids:
return []
- N = float(len(self._docweight)) # total # of docs
- meandoclen = self._totaldoclen / N
+ N = float(self.document_count()) # total # of docs
+ try:
+ doclen = self._totaldoclen()
+ except TypeError:
+ # _totaldoclen has not yet been upgraded
+ doclen = self._totaldoclen
+ meandoclen = doclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
=== Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py 1.35.2.6 => 1.35.2.7 ===
--- Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py:1.35.2.6 Fri Feb 28 17:19:22 2003
+++ Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py Thu Jun 5 16:37:04 2003
@@ -148,7 +148,7 @@
## Pluggable Index APIs ##
- def index_object(self, docid, obj, threshold=None):
+ def index_object(self, docid, obj, threshold=None, attr=None):
# XXX We currently ignore subtransaction threshold
text = getattr(obj, self._fieldname, None)
if text is None:
@@ -158,13 +158,11 @@
if text is None:
return 0
count = self.index.index_doc(docid, text)
- self._p_changed = 1 # XXX
return count
def unindex_object(self, docid):
if self.index.has_doc(docid):
self.index.unindex_doc(docid)
- self._p_changed = 1 # XXX
def _apply_index(self, request, cid=''):
"""Apply query specified by request, a mapping containing the query.
@@ -183,7 +181,7 @@
return None
tree = QueryParser(self.getLexicon()).parseQuery(query_str)
results = tree.executeQuery(self.index)
- return results, (self._fieldname,)
+ return results, (self.id,)
def getEntryForObject(self, documentId, default=None):
"""Return the list of words indexed for documentId"""
@@ -217,13 +215,13 @@
manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
- def getIndexType(self):
- """Return index type string"""
- return getattr(self, '_index_type', self._index_factory.__name__)
-
def getFieldName(self):
"""Return indexed attribute name"""
return self._fieldname
+
+ def getIndexType(self):
+ """Return index type string"""
+ return getattr(self, '_index_type', self._index_factory.__name__)
def getLexiconURL(self):
"""Return the url of the lexicon used by the index"""