[Zope-CVS] CVS: Products/ZCTextIndex/tests - indexhtml.py:1.7
Jeremy Hylton
jeremy@zope.com
Fri, 17 May 2002 14:56:01 -0400
Update of /cvs-repository/Products/ZCTextIndex/tests
In directory cvs.zope.org:/tmp/cvs-serv16359
Modified Files:
indexhtml.py
Log Message:
Add a little splitter that behaves pretty much like HTMLWordSplitter,
but works with a TextIndex Lexicon.
=== Products/ZCTextIndex/tests/indexhtml.py 1.6 => 1.7 ===
-
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
@@ -9,6 +8,7 @@
options:
-f data.fs -- the path to the filestorage datafile
"""
+from __future__ import nested_scopes
import os
from time import clock
@@ -32,12 +32,28 @@
caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
return ZCTextIndex("read", extra, caller)
+# XXX make a splitter more like the HTMLSplitter for TextIndex
+# signature is
+# Splitter(string, stop_words, encoding,
+# singlechar, indexnumbers, casefolding)
+
+class MySplitter:
+ def __init__(self):
+ self._v_splitter = HTMLWordSplitter()
+ def __call__(self, text, stopdict, *args, **kwargs):
+ words = self._v_splitter._split(text)
+ def lookup(w):
+ return stopdict.get(w, w)
+ return filter(None, map(lookup, words))
+
def make_old_index():
from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
from Products.PluginIndexes.TextIndex.Lexicon \
import Lexicon, stop_word_dict
- return TextIndex("read", lexicon=Lexicon(stop_word_dict))
+ l = Lexicon(stop_word_dict)
+ l.SplitterFunc = MySplitter()
+ return TextIndex("read", lexicon=l)
def main(db, root, dir):
rt["index"] = index = INDEX()