[Zope3-checkins] SVN: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/ Refactored the text index to implement IIndexSearch, rather than

Tue Dec 7 17:36:19 EST 2004

Log message for revision 28580:
  Refactored the text index to implement IIndexSearch, rather than
  IQuerying. Also renamed TextIndexWrapper to TextIndex. (Wrapper was
  confusing.)
  

Changed:
  U   Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py
  U   Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py
  A   Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.py
  A   Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt
  D   Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py

-=-
Modified: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py
===================================================================

--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py	2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py	2004-12-07 22:36:18 UTC (rev 28580)
@@ -1 +1 @@
-from zope.index.text.textindexwrapper import TextIndexWrapper as TextIndex
+from zope.index.text.textindex import TextIndex

Modified: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py	2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py	2004-12-07 22:36:18 UTC (rev 28580)
@@ -18,119 +18,9 @@
 
 import unittest
 
-from zope.index.text.textindexwrapper import TextIndexWrapper
-from zope.index.text import parsetree
-
-class TextIndexWrapperTest(unittest.TestCase):
-
-    def setUp(self):
-        w = TextIndexWrapper()
-        doc = u"the quick brown fox jumps over the lazy dog"
-        w.index_doc(1000, [doc])
-        doc = u"the brown fox and the yellow fox don't need the retriever"
-        w.index_doc(1001, [doc])
-        self.wrapper = w
-
-    def test_clear(self):
-        self.wrapper.clear()
-        self.assertEqual(self.wrapper.documentCount(), 0)
-        self.assertEqual(self.wrapper.wordCount(), 0)
-
-    def testCounts(self):
-        w = self.wrapper
-        self.assertEqual(self.wrapper.documentCount(), 2)
-        self.assertEqual(self.wrapper.wordCount(), 12)
-        doc = u"foo bar"
-        w.index_doc(1002, [doc])
-        self.assertEqual(self.wrapper.documentCount(), 3)
-        self.assertEqual(self.wrapper.wordCount(), 14)
-
-    def testOne(self):
-        matches, total = self.wrapper.query(u"quick fox", 0, 10)
-        self.assertEqual(total, 1)
-        [(docid, rank)] = matches # if this fails there's a problem
-        self.assertEqual(docid, 1000)
-
-    def testDefaultBatch(self):
-        matches, total = self.wrapper.query(u"fox", 0)
-        self.assertEqual(total, 2)
-        self.assertEqual(len(matches), 2)
-        matches, total = self.wrapper.query(u"fox")
-        self.assertEqual(total, 2)
-        self.assertEqual(len(matches), 2)
-        matches, total = self.wrapper.query(u" fox", 1)
-        self.assertEqual(total, 2)
-        self.assertEqual(len(matches), 1)
-
-    def testGlobbing(self):
-        matches, total = self.wrapper.query("fo*")
-        self.assertEqual(total, 2)
-        self.assertEqual(len(matches), 2)
-
-    def testLatin1(self):
-        w = self.wrapper
-        doc = u"Fran\xe7ois"
-        w.index_doc(1002, [doc])
-        matches, total = self.wrapper.query(doc, 0, 10)
-        self.assertEqual(total, 1)
-        [(docid, rank)] = matches # if this fails there's a problem
-        self.assertEqual(docid, 1002)
-
-    def testUnicode(self):
-        w = self.wrapper
-        # Verbose, but easy to debug
-        delta  = u"\N{GREEK SMALL LETTER DELTA}"
-        delta += u"\N{GREEK SMALL LETTER EPSILON}"
-        delta += u"\N{GREEK SMALL LETTER LAMDA}"
-        delta += u"\N{GREEK SMALL LETTER TAU}"
-        delta += u"\N{GREEK SMALL LETTER ALPHA}"
-        self.assert_(delta.islower())
-        emdash = u"\N{EM DASH}"
-        self.assert_(not emdash.isalnum())
-        alpha  = u"\N{GREEK SMALL LETTER ALPHA}"
-        self.assert_(alpha.islower())
-        lamda  = u"\N{GREEK SMALL LETTER LAMDA}"
-        lamda += u"\N{GREEK SMALL LETTER ALPHA}"
-        self.assert_(lamda.islower())
-        doc = delta + emdash + alpha
-        w.index_doc(1002, [doc])
-        for word in delta, alpha:
-            matches, total = self.wrapper.query(word, 0, 10)
-            self.assertEqual(total, 1)
-            [(docid, rank)] = matches # if this fails there's a problem
-            self.assertEqual(docid, 1002)
-        self.assertRaises(parsetree.ParseError,
-                          self.wrapper.query, emdash, 0, 10)
-        matches, total = self.wrapper.query(lamda, 0, 10)
-        self.assertEqual(total, 0)
-
-    def testNone(self):
-        matches, total = self.wrapper.query(u"dalmatian", 0, 10)
-        self.assertEqual(total, 0)
-        self.assertEqual(len(matches), 0)
-
-    def testAll(self):
-        matches, total = self.wrapper.query(u"brown fox", 0, 10)
-        self.assertEqual(total, 2)
-        self.assertEqual(len(matches), 2)
-        matches.sort()
-        self.assertEqual(matches[0][0], 1000)
-        self.assertEqual(matches[1][0], 1001)
-
-    def testBatching(self):
-        matches1, total = self.wrapper.query(u"brown fox", 0, 1)
-        self.assertEqual(total, 2)
-        self.assertEqual(len(matches1), 1)
-        matches2, total = self.wrapper.query(u"brown fox", 1, 1)
-        self.assertEqual(total, 2)
-        self.assertEqual(len(matches2), 1)
-        matches = matches1 + matches2
-        matches.sort()
-        self.assertEqual(matches[0][0], 1000)
-        self.assertEqual(matches[1][0], 1001)
-
 def test_suite():
-    return unittest.makeSuite(TextIndexWrapperTest)
-
+    from zope.testing import doctest
+    return doctest.DocFileSuite("../textindex.txt")
+    
 if __name__=='__main__':
     unittest.main(defaultTest='test_suite')

Copied: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.py (from rev 28577, Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py)
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py	2004-12-07 18:15:57 UTC (rev 28577)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.py	2004-12-07 22:36:18 UTC (rev 28580)
@@ -0,0 +1,89 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Text index wrapper.
+
+This exists to implement IInjection, IIndexSearch and IStatistics.
+
+$Id$
+"""
+
+from persistent import Persistent
+from zope.interface import implements
+
+from zope.index.text.baseindex import SCALE_FACTOR
+from zope.index.text.okapiindex import OkapiIndex
+from zope.index.text.lexicon import Lexicon
+from zope.index.text.lexicon import Splitter, CaseNormalizer, StopWordRemover
+from zope.index.text.queryparser import QueryParser
+
+from zope.index.interfaces import IInjection, IIndexSearch, IStatistics
+
+class TextIndex(Persistent):
+    """Text index combining a lexicon with an inverted index."""
+    implements(IInjection, IIndexSearch, IStatistics)
+
+    def __init__(self, lexicon=None, index=None):
+        """Provisional constructor.
+
+        This creates the lexicon and index if not passed in."""
+        if lexicon is None:
+            lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
+        if index is None:
+            index = OkapiIndex(lexicon)
+        self.lexicon = lexicon
+        self.index = index
+
+    def index_doc(self, docid, text):
+        """Index `text` under `docid` in the underlying index."""
+        self.index.index_doc(docid, text)
+
+    def unindex_doc(self, docid):
+        """Remove `docid` from the underlying index."""
+        self.index.unindex_doc(docid)
+
+    def clear(self):
+        """Clear the underlying index."""
+        self.index.clear()
+
+    def documentCount(self):
+        """Return the number of documents in the index."""
+        return self.index.documentCount()
+
+    def wordCount(self):
+        """Return the number of words in the index."""
+        return self.index.wordCount()
+
+    def apply(self, querytext, start=0, count=None):
+        """Return the results for `querytext`, scores rescaled in place."""
+        # NOTE: `start` and `count` are accepted but not used here.
+        import sys  # used by the overflow fallback; not imported at file top
+        tree = QueryParser(self.lexicon).parseQuery(querytext)
+        results = tree.executeQuery(self.index)
+        if results:
+            # Hack to avoid ZeroDivisionError: never go below SCALE_FACTOR.
+            qw = max(self.index.query_weight(tree.terms()), SCALE_FACTOR)
+
+            # TODO we should seriously consider using float
+            # scores. Since we are using ints, we'll scale this
+            # result to get integers other than zero.  We'll use
+            # 100 so we can pretend this is a percent. ;)
+            qw *= .01
+
+            for docid, score in results.iteritems():
+                try:
+                    results[docid] = int(score/qw)
+                except TypeError:
+                    # We overflowed the score, perhaps wildly unlikely.
+                    results[docid] = sys.maxint/10
+        return results

Added: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt	2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt	2004-12-07 22:36:18 UTC (rev 28580)
@@ -0,0 +1,109 @@
+Text Indexes
+============
+
+Text indexes combine an inverted index and a lexicon to support text
+indexing and searching.  A text index can be created without passing
+any arguments:
+
+    >>> from zope.index.text.textindex import TextIndex
+    >>> index = TextIndex()
+
+By default, it uses an "Okapi" inverted index and a lexicon with a
+pipeline consisting of a simple word splitter, a case normalizer,
+and a stop-word remover.
+
+We index text using the `index_doc` method:
+
+    >>> index.index_doc(1, u"the quick brown fox jumps over the lazy dog")
+    >>> index.index_doc(2,
+    ...    u"the brown fox and the yellow fox don't need the retriever")
+    >>> index.index_doc(3, u"""
+    ... The Conservation Pledge
+    ... =======================
+    ... 
+    ... I give my pledge, as an American, to save, and faithfully
+    ... to defent from waste, the natural resources of my Country; 
+    ... it's soils, minerals, forests, waters and wildlife.
+    ... """)
+    >>> index.index_doc(4, u"Fran\xe7ois") 
+    >>> word = (
+    ...     u"\N{GREEK SMALL LETTER DELTA}"
+    ...     u"\N{GREEK SMALL LETTER EPSILON}"
+    ...     u"\N{GREEK SMALL LETTER LAMDA}"
+    ...     u"\N{GREEK SMALL LETTER TAU}"
+    ...     u"\N{GREEK SMALL LETTER ALPHA}"
+    ...     )
+    >>> index.index_doc(5, word + u"\N{EM DASH}\N{GREEK SMALL LETTER ALPHA}")
+    >>> index.index_doc(6, u"""
+    ... What we have here, is a failure to communicate.
+    ... """)
+    >>> index.index_doc(7, u"""
+    ... Hold on to your butts!
+    ... """)
+    >>> index.index_doc(8, u"""
+    ... The Zen of Python, by Tim Peters
+    ... 
+    ... Beautiful is better than ugly.
+    ... Explicit is better than implicit.
+    ... Simple is better than complex.
+    ... Complex is better than complicated.
+    ... Flat is better than nested.
+    ... Sparse is better than dense.
+    ... Readability counts.
+    ... Special cases aren't special enough to break the rules.
+    ... Although practicality beats purity.
+    ... Errors should never pass silently.
+    ... Unless explicitly silenced.
+    ... In the face of ambiguity, refuse the temptation to guess.
+    ... There should be one-- and preferably only one --obvious way to do it.
+    ... Although that way may not be obvious at first unless you're Dutch.
+    ... Now is better than never.
+    ... Although never is often better than *right* now.
+    ... If the implementation is hard to explain, it's a bad idea.
+    ... If the implementation is easy to explain, it may be a good idea.
+    ... Namespaces are one honking great idea -- let's do more of those!
+    ... """)
+
+Then we can search using the apply method, which takes a search
+string:
+
+    >>> index.apply(u'brown fox')
+    BTrees._IIBTree.IIBucket([(1, 61), (2, 67)])
+
+    >>> index.apply(u'quick fox')
+    BTrees._IIBTree.IIBucket([(1, 61)])
+
+    >>> index.apply(u'brown python')
+    BTrees._IIBTree.IIBucket([])
+
+    >>> index.apply(u'dalmatian')
+    BTrees._IIBTree.IIBucket([])
+
+    >>> index.apply(u'brown or python')
+    BTrees._IIBTree.IIBucket([(1, 26), (2, 25), (8, 9)])
+
+    >>> index.apply(u'butts')
+    BTrees._IIBTree.IIBucket([(7, 69)])
+
+The outputs are mappings from document ids to integer scores. Items
+with higher scores are more relevant.
+
+We can use unicode characters in search strings:
+
+    >>> index.apply(u"Fran\xe7ois")
+    BTrees._IIBTree.IIBucket([(4, 74)])
+
+    >>> index.apply(word)
+    BTrees._IIBTree.IIBucket([(5, 71)])
+
+We can use globbing in search strings:
+
+    >>> index.apply('fo*')
+    BTrees._IIBTree.IIBucket([(1, 217), (2, 265), (3, 204)])
+
+Text indexes support basic statistics:
+
+    >>> index.documentCount()
+    8
+    >>> index.wordCount()
+    114


Property changes on: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt
___________________________________________________________________
Name: svn:eol-style
   + native

Deleted: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py	2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py	2004-12-07 22:36:18 UTC (rev 28580)
@@ -1,89 +0,0 @@
-##############################################################################
-#
-# Copyright (c) 2002 Zope Corporation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Text index wrapper.
-
-This exists to implement IInjection and IQuerying.
-
-$Id$
-"""
-
-from persistent import Persistent
-from zope.interface import implements
-
-from zope.index.text.okapiindex import OkapiIndex
-from zope.index.text.lexicon import Lexicon
-from zope.index.text.lexicon import Splitter, CaseNormalizer, StopWordRemover
-from zope.index.text.queryparser import QueryParser
-from zope.index.nbest import NBest
-
-from zope.index.interfaces import IInjection, IQuerying, IStatistics
-
-class TextIndexWrapper(Persistent):
-
-    implements(IInjection, IQuerying, IStatistics)
-
-    def __init__(self, lexicon=None, index=None):
-        """Provisional constructor.
-
-        This creates the lexicon and index if not passed in."""
-        if lexicon is None:
-            lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
-        if index is None:
-            index = OkapiIndex(lexicon)
-        self.lexicon = lexicon
-        self.index = index
-
-    # Methods implementing IInjection
-
-    def index_doc(self, docid, text):
-        self.index.index_doc(docid, text)
-
-    def unindex_doc(self, docid):
-        self.index.unindex_doc(docid)
-
-    def clear(self):
-        self.index.clear()
-
-    # Methods implementing IQuerying
-
-    def query(self, querytext, start=0, count=None):
-        parser = QueryParser(self.lexicon)
-        tree = parser.parseQuery(querytext)
-        results = tree.executeQuery(self.index)
-        if not results:
-            return [], 0
-        if count is None:
-            count = max(0, len(results) - start)
-        chooser = NBest(start + count)
-        chooser.addmany(results.items())
-        batch = chooser.getbest()
-        batch = batch[start:]
-        if batch:
-            qw = self.index.query_weight(tree.terms())
-            # Hack to avoid ZeroDivisionError
-            if qw == 0:
-                qw = batch[0][1] or 1
-            qw *= 1.0
-            batch = [(docid, score/qw) for docid, score in batch]
-        return batch, len(results)
-
-    # Methods implementing IStatistics
-
-    def documentCount(self):
-        """Return the number of documents in the index."""
-        return self.index.documentCount()
-
-    def wordCount(self):
-        """Return the number of words in the index."""
-        return self.index.wordCount()