[Zope3-checkins]
SVN: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/
Refactored the text index to implement IIndexSearch, rather than IQuerying
Jim Fulton
jim at zope.com
Tue Dec 7 17:36:19 EST 2004
Log message for revision 28580:
Refactored the text index to implement IIndexSearch, rather than
IQuerying. Also renamed TextIndexWrapper to TextIndex. (Wrapper was
confusing.)
Changed:
U Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py
U Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py
A Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.py
A Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt
D Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py
-=-
Modified: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py 2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/__init__.py 2004-12-07 22:36:18 UTC (rev 28580)
@@ -1 +1 @@
-from zope.index.text.textindexwrapper import TextIndexWrapper as TextIndex
+from zope.index.text.textindex import TextIndex
Modified: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py 2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/tests/test_textindexwrapper.py 2004-12-07 22:36:18 UTC (rev 28580)
@@ -18,119 +18,9 @@
import unittest
-from zope.index.text.textindexwrapper import TextIndexWrapper
-from zope.index.text import parsetree
-
-class TextIndexWrapperTest(unittest.TestCase):
-
- def setUp(self):
- w = TextIndexWrapper()
- doc = u"the quick brown fox jumps over the lazy dog"
- w.index_doc(1000, [doc])
- doc = u"the brown fox and the yellow fox don't need the retriever"
- w.index_doc(1001, [doc])
- self.wrapper = w
-
- def test_clear(self):
- self.wrapper.clear()
- self.assertEqual(self.wrapper.documentCount(), 0)
- self.assertEqual(self.wrapper.wordCount(), 0)
-
- def testCounts(self):
- w = self.wrapper
- self.assertEqual(self.wrapper.documentCount(), 2)
- self.assertEqual(self.wrapper.wordCount(), 12)
- doc = u"foo bar"
- w.index_doc(1002, [doc])
- self.assertEqual(self.wrapper.documentCount(), 3)
- self.assertEqual(self.wrapper.wordCount(), 14)
-
- def testOne(self):
- matches, total = self.wrapper.query(u"quick fox", 0, 10)
- self.assertEqual(total, 1)
- [(docid, rank)] = matches # if this fails there's a problem
- self.assertEqual(docid, 1000)
-
- def testDefaultBatch(self):
- matches, total = self.wrapper.query(u"fox", 0)
- self.assertEqual(total, 2)
- self.assertEqual(len(matches), 2)
- matches, total = self.wrapper.query(u"fox")
- self.assertEqual(total, 2)
- self.assertEqual(len(matches), 2)
- matches, total = self.wrapper.query(u" fox", 1)
- self.assertEqual(total, 2)
- self.assertEqual(len(matches), 1)
-
- def testGlobbing(self):
- matches, total = self.wrapper.query("fo*")
- self.assertEqual(total, 2)
- self.assertEqual(len(matches), 2)
-
- def testLatin1(self):
- w = self.wrapper
- doc = u"Fran\xe7ois"
- w.index_doc(1002, [doc])
- matches, total = self.wrapper.query(doc, 0, 10)
- self.assertEqual(total, 1)
- [(docid, rank)] = matches # if this fails there's a problem
- self.assertEqual(docid, 1002)
-
- def testUnicode(self):
- w = self.wrapper
- # Verbose, but easy to debug
- delta = u"\N{GREEK SMALL LETTER DELTA}"
- delta += u"\N{GREEK SMALL LETTER EPSILON}"
- delta += u"\N{GREEK SMALL LETTER LAMDA}"
- delta += u"\N{GREEK SMALL LETTER TAU}"
- delta += u"\N{GREEK SMALL LETTER ALPHA}"
- self.assert_(delta.islower())
- emdash = u"\N{EM DASH}"
- self.assert_(not emdash.isalnum())
- alpha = u"\N{GREEK SMALL LETTER ALPHA}"
- self.assert_(alpha.islower())
- lamda = u"\N{GREEK SMALL LETTER LAMDA}"
- lamda += u"\N{GREEK SMALL LETTER ALPHA}"
- self.assert_(lamda.islower())
- doc = delta + emdash + alpha
- w.index_doc(1002, [doc])
- for word in delta, alpha:
- matches, total = self.wrapper.query(word, 0, 10)
- self.assertEqual(total, 1)
- [(docid, rank)] = matches # if this fails there's a problem
- self.assertEqual(docid, 1002)
- self.assertRaises(parsetree.ParseError,
- self.wrapper.query, emdash, 0, 10)
- matches, total = self.wrapper.query(lamda, 0, 10)
- self.assertEqual(total, 0)
-
- def testNone(self):
- matches, total = self.wrapper.query(u"dalmatian", 0, 10)
- self.assertEqual(total, 0)
- self.assertEqual(len(matches), 0)
-
- def testAll(self):
- matches, total = self.wrapper.query(u"brown fox", 0, 10)
- self.assertEqual(total, 2)
- self.assertEqual(len(matches), 2)
- matches.sort()
- self.assertEqual(matches[0][0], 1000)
- self.assertEqual(matches[1][0], 1001)
-
- def testBatching(self):
- matches1, total = self.wrapper.query(u"brown fox", 0, 1)
- self.assertEqual(total, 2)
- self.assertEqual(len(matches1), 1)
- matches2, total = self.wrapper.query(u"brown fox", 1, 1)
- self.assertEqual(total, 2)
- self.assertEqual(len(matches2), 1)
- matches = matches1 + matches2
- matches.sort()
- self.assertEqual(matches[0][0], 1000)
- self.assertEqual(matches[1][0], 1001)
-
def test_suite():
- return unittest.makeSuite(TextIndexWrapperTest)
-
+ from zope.testing import doctest
+ return doctest.DocFileSuite("../textindex.txt")
+
if __name__=='__main__':
unittest.main(defaultTest='test_suite')
Copied: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.py (from rev 28577, Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py)
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py 2004-12-07 18:15:57 UTC (rev 28577)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.py 2004-12-07 22:36:18 UTC (rev 28580)
@@ -0,0 +1,89 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Text index wrapper.
+
+This exists to implement IInjection, IIndexSearch and IStatistics.
+
+$Id$
+"""
+
+from persistent import Persistent
+from zope.interface import implements
+
+from zope.index.text.baseindex import SCALE_FACTOR
+from zope.index.text.okapiindex import OkapiIndex
+from zope.index.text.lexicon import Lexicon
+from zope.index.text.lexicon import Splitter, CaseNormalizer, StopWordRemover
+from zope.index.text.queryparser import QueryParser
+
+from zope.index.interfaces import IInjection, IIndexSearch, IStatistics
+
+class TextIndex(Persistent):
+
+ implements(IInjection, IIndexSearch, IStatistics)
+
+ def __init__(self, lexicon=None, index=None):
+ """Provisional constructor.
+
+ This creates the lexicon and index if not passed in."""
+ if lexicon is None:
+ lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
+ if index is None:
+ index = OkapiIndex(lexicon)
+ self.lexicon = lexicon
+ self.index = index
+
+ def index_doc(self, docid, text):
+ self.index.index_doc(docid, text)
+
+ def unindex_doc(self, docid):
+ self.index.unindex_doc(docid)
+
+ def clear(self):
+ self.index.clear()
+
+ def documentCount(self):
+ """Return the number of documents in the index."""
+ return self.index.documentCount()
+
+ def wordCount(self):
+ """Return the number of words in the index."""
+ return self.index.wordCount()
+
+ def apply(self, querytext, start=0, count=None):
+ parser = QueryParser(self.lexicon)
+ tree = parser.parseQuery(querytext)
+ results = tree.executeQuery(self.index)
+ if results:
+ qw = self.index.query_weight(tree.terms())
+
+ # Hack to avoid ZeroDivisionError
+ if qw < SCALE_FACTOR:
+ qw = SCALE_FACTOR
+
+ # TODO we should seriously consider using float
+ # scores. Since we are using ints. we'll scale this
+ # result to get integers other than zero. We'll use
+ # 100 so we can pretend this is a percent. ;)
+ qw *= .01
+
+ for docid, score in results.iteritems():
+ try:
+ results[docid] = int(score/qw)
+ except TypeError:
+ # We overflowed the score, perhaps wildly unlikely.
+ # Who knows.
+ results[docid] = sys.maxint/10
+
+ return results
Added: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt 2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt 2004-12-07 22:36:18 UTC (rev 28580)
@@ -0,0 +1,109 @@
+Text Indexes
+============
+
+Text indexes combine an inverted index and a lexicon to support text
+indexing and searching. A text index can be created without passing
+any arguments:
+
+ >>> from zope.index.text.textindex import TextIndex
+ >>> index = TextIndex()
+
+By default, it uses an "Okapi" inverted index and a lexicon with a
+pipeline consisting of a simple word splitter, a case normalizer,
+and a stop-word remover.
+
+We index text using the `index_doc` method:
+
+ >>> index.index_doc(1, u"the quick brown fox jumps over the lazy dog")
+ >>> index.index_doc(2,
+ ... u"the brown fox and the yellow fox don't need the retriever")
+ >>> index.index_doc(3, u"""
+ ... The Conservation Pledge
+ ... =======================
+ ...
+ ... I give my pledge, as an American, to save, and faithfully
+ ... to defent from waste, the natural resources of my Country;
+ ... it's soils, minerals, forests, waters and wildlife.
+ ... """)
+ >>> index.index_doc(4, u"Fran\xe7ois")
+ >>> word = (
+ ... u"\N{GREEK SMALL LETTER DELTA}"
+ ... u"\N{GREEK SMALL LETTER EPSILON}"
+ ... u"\N{GREEK SMALL LETTER LAMDA}"
+ ... u"\N{GREEK SMALL LETTER TAU}"
+ ... u"\N{GREEK SMALL LETTER ALPHA}"
+ ... )
+ >>> index.index_doc(5, word + u"\N{EM DASH}\N{GREEK SMALL LETTER ALPHA}")
+ >>> index.index_doc(6, u"""
+ ... What we have here, is a failure to communicate.
+ ... """)
+ >>> index.index_doc(7, u"""
+ ... Hold on to your butts!
+ ... """)
+ >>> index.index_doc(8, u"""
+ ... The Zen of Python, by Tim Peters
+ ...
+ ... Beautiful is better than ugly.
+ ... Explicit is better than implicit.
+ ... Simple is better than complex.
+ ... Complex is better than complicated.
+ ... Flat is better than nested.
+ ... Sparse is better than dense.
+ ... Readability counts.
+ ... Special cases aren't special enough to break the rules.
+ ... Although practicality beats purity.
+ ... Errors should never pass silently.
+ ... Unless explicitly silenced.
+ ... In the face of ambiguity, refuse the temptation to guess.
+ ... There should be one-- and preferably only one --obvious way to do it.
+ ... Although that way may not be obvious at first unless you're Dutch.
+ ... Now is better than never.
+ ... Although never is often better than *right* now.
+ ... If the implementation is hard to explain, it's a bad idea.
+ ... If the implementation is easy to explain, it may be a good idea.
+ ... Namespaces are one honking great idea -- let's do more of those!
+ ... """)
+
+Then we can search using the apply method, which takes a search
+string:
+
+ >>> index.apply(u'brown fox')
+ BTrees._IIBTree.IIBucket([(1, 61), (2, 67)])
+
+ >>> index.apply(u'quick fox')
+ BTrees._IIBTree.IIBucket([(1, 61)])
+
+ >>> index.apply(u'brown python')
+ BTrees._IIBTree.IIBucket([])
+
+ >>> index.apply(u'dalmatian')
+ BTrees._IIBTree.IIBucket([])
+
+ >>> index.apply(u'brown or python')
+ BTrees._IIBTree.IIBucket([(1, 26), (2, 25), (8, 9)])
+
+ >>> index.apply(u'butts')
+ BTrees._IIBTree.IIBucket([(7, 69)])
+
+The outputs are mappings from document ids to integer scores. Items
+with higher scores are more relevant.
+
+We can use unicode characters in search strings:
+
+ >>> index.apply(u"Fran\xe7ois")
+ BTrees._IIBTree.IIBucket([(4, 74)])
+
+ >>> index.apply(word)
+ BTrees._IIBTree.IIBucket([(5, 71)])
+
+We can use globbing in search strings:
+
+ >>> index.apply('fo*')
+ BTrees._IIBTree.IIBucket([(1, 217), (2, 265), (3, 204)])
+
+Text indexes support basic statistics:
+
+ >>> index.documentCount()
+ 8
+ >>> index.wordCount()
+ 114
Property changes on: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindex.txt
___________________________________________________________________
Name: svn:eol-style
+ native
Deleted: Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py
===================================================================
--- Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py 2004-12-07 18:23:58 UTC (rev 28579)
+++ Zope3/branches/jim-index-restructure-2004-12/src/zope/index/text/textindexwrapper.py 2004-12-07 22:36:18 UTC (rev 28580)
@@ -1,89 +0,0 @@
-##############################################################################
-#
-# Copyright (c) 2002 Zope Corporation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Text index wrapper.
-
-This exists to implement IInjection and IQuerying.
-
-$Id$
-"""
-
-from persistent import Persistent
-from zope.interface import implements
-
-from zope.index.text.okapiindex import OkapiIndex
-from zope.index.text.lexicon import Lexicon
-from zope.index.text.lexicon import Splitter, CaseNormalizer, StopWordRemover
-from zope.index.text.queryparser import QueryParser
-from zope.index.nbest import NBest
-
-from zope.index.interfaces import IInjection, IQuerying, IStatistics
-
-class TextIndexWrapper(Persistent):
-
- implements(IInjection, IQuerying, IStatistics)
-
- def __init__(self, lexicon=None, index=None):
- """Provisional constructor.
-
- This creates the lexicon and index if not passed in."""
- if lexicon is None:
- lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
- if index is None:
- index = OkapiIndex(lexicon)
- self.lexicon = lexicon
- self.index = index
-
- # Methods implementing IInjection
-
- def index_doc(self, docid, text):
- self.index.index_doc(docid, text)
-
- def unindex_doc(self, docid):
- self.index.unindex_doc(docid)
-
- def clear(self):
- self.index.clear()
-
- # Methods implementing IQuerying
-
- def query(self, querytext, start=0, count=None):
- parser = QueryParser(self.lexicon)
- tree = parser.parseQuery(querytext)
- results = tree.executeQuery(self.index)
- if not results:
- return [], 0
- if count is None:
- count = max(0, len(results) - start)
- chooser = NBest(start + count)
- chooser.addmany(results.items())
- batch = chooser.getbest()
- batch = batch[start:]
- if batch:
- qw = self.index.query_weight(tree.terms())
- # Hack to avoid ZeroDivisionError
- if qw == 0:
- qw = batch[0][1] or 1
- qw *= 1.0
- batch = [(docid, score/qw) for docid, score in batch]
- return batch, len(results)
-
- # Methods implementing IStatistics
-
- def documentCount(self):
- """Return the number of documents in the index."""
- return self.index.documentCount()
-
- def wordCount(self):
- """Return the number of words in the index."""
- return self.index.wordCount()
More information about the Zope3-Checkins
mailing list