[Zope-CVS] CVS: Products/ZCTextIndex - HTMLSplitter.py:1.8 IPipelineElementFactory.py:1.2 Lexicon.py:1.15 PipelineFactory.py:1.2 ZCTextIndex.py:1.21 __init__.py:1.7

Casey Duncan casey@zope.com
Wed, 22 May 2002 15:53:40 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv4487

Modified Files:
	HTMLSplitter.py IPipelineElementFactory.py Lexicon.py 
	PipelineFactory.py ZCTextIndex.py __init__.py 
Log Message:
Enhanced pipeline element factory so that you can group elements that must be
selected in a mutually exclusive manner (such as splitters).

Existing pipeline elements have been grouped appropriately.

Added a stop word remover that does not remove single char words.

Modified ZMI lexicon add form to use pipeline element groups to render form.
Groups with multiple elements are rendered as selects, singletons are rendered
as checkboxes.


=== Products/ZCTextIndex/HTMLSplitter.py 1.7 => 1.8 ===
 
 from Products.ZCTextIndex.ISplitter import ISplitter
-from Products.ZCTextIndex.PipelineFactory import splitter_factory
+from Products.ZCTextIndex.PipelineFactory import element_factory
 
 import re
 
@@ -45,7 +45,9 @@
         return [word for word in text.split()
                 if len(word) > 1 and rx.search(word)]
                 
-splitter_factory.registerFactory('HTML Word Splitter', HTMLWordSplitter)
+element_factory.registerFactory('Word Splitter', 
+                                'HTML aware splitter',
+                                HTMLWordSplitter)
 
 if __name__ == "__main__":
     import sys


=== Products/ZCTextIndex/IPipelineElementFactory.py 1.1 => 1.2 ===
     """Class for creating pipeline elements by name"""
 
-    def registerFactory(name, factory):
-        """Registers a pipeline factory by name.
+    def registerFactory(group, name, factory):
+        """Registers a pipeline factory by name and element group.
         
-        Each name can be registered only once. Duplicate registrations
-        will raise a ValueError
+        Each name can be registered only once for a given group. Duplicate 
+        registrations will raise a ValueError
         """
         
-    def getFactoryNames():
+    def getFactoryGroups():
+        """Returns a sorted list of element group names
+        """
+        
+    def getFactoryNames(group):
         """Returns a sorted list of registered pipeline factory names
+        in the specified element group
         """
         
-    def instantiate(name):
-        """Instantiates a pipeline element by name. If name is not registered
-        raise a KeyError.
+    def instantiate(group, name):
+        """Instantiates a pipeline element by group and name. If name is not 
+        registered raise a KeyError.
         """


=== Products/ZCTextIndex/Lexicon.py 1.14 => 1.15 ===
 from Products.ZCTextIndex.StopDict import get_stopdict
 from Products.ZCTextIndex.ParseTree import QueryError
-from Products.ZCTextIndex.PipelineFactory import \
-     splitter_factory, element_factory
+from Products.ZCTextIndex.PipelineFactory import element_factory
 
 class Lexicon:
 
@@ -169,20 +168,26 @@
             result += self.rxGlob.findall(s)
         return result
         
-splitter_factory.registerFactory('Regex Splitter', Splitter)
+element_factory.registerFactory('Word Splitter', 
+                                 'Whitespace splitter', 
+                                 Splitter)
 
 class CaseNormalizer:
 
     def process(self, lst):
         return [w.lower() for w in lst]
         
-element_factory.registerFactory('Case Normalizer', CaseNormalizer)
+element_factory.registerFactory('Case Normalizer',
+                                'Case Normalizer', 
+                                CaseNormalizer)
+
+element_factory.registerFactory('Stop Words', 
+                                ' Don\'t remove stop words', 
+                                None)
 
 class StopWordRemover:
 
     dict = get_stopdict().copy()
-    for c in range(255):
-        dict[chr(c)] = None
 
     try:
         from Products.ZCTextIndex.stopper import process as _process
@@ -193,6 +198,17 @@
     else:
         def process(self, lst):
             return self._process(self.dict, lst)
+
+element_factory.registerFactory('Stop Words', 
+                                'Remove listed stop words only', 
+                                StopWordRemover)
+
+class StopWordAndSingleCharRemover(StopWordRemover):
+
+    dict = get_stopdict().copy()
+    for c in range(255):
+        dict[chr(c)] = None
             
-            
-element_factory.registerFactory('Stop Word Remover', StopWordRemover)
+element_factory.registerFactory('Stop Words', 
+                                'Remove listed and single char words', 
+                                StopWordAndSingleCharRemover)


=== Products/ZCTextIndex/PipelineFactory.py 1.1 => 1.2 ===
     
     def __init__(self):
-        self._elements = {}
+        self._groups = {}
     
-    def registerFactory(self, name, factory):
-        if self._elements.has_key(name):
-            raise ValueError, 'ZCTextIndex splitter named' + \
-                              '"%s" already registered'
+    def registerFactory(self, group, name, factory):
+        if self._groups.has_key(group) and \
+           self._groups[group].has_key(name):
+            raise ValueError('ZCTextIndex lexicon element "%s" '
+                             'already registered in group "%s"' 
+                             % (name, group))
+                             
+        elements = self._groups.get(group)
+        if elements is None:
+            elements = self._groups[group] = {}
+        elements[name] = factory
         
-        self._elements[name] = factory
+    def getFactoryGroups(self):
+        groups = self._groups.keys()
+        groups.sort()
+        return groups
         
-    def getFactoryNames(self):
-        names = self._elements.keys()
+    def getFactoryNames(self, group):
+        names = self._groups[group].keys()
         names.sort()
         return names
         
-    def instantiate(self, name):
-        return self._elements[name]()
-        
-
-splitter_factory = PipelineElementFactory()
+    def instantiate(self, group, name):
+        factory = self._groups[group][name]
+        if factory is not None:
+            return factory()
 
 element_factory = PipelineElementFactory()


=== Products/ZCTextIndex/ZCTextIndex.py 1.20 => 1.21 ===
 from Products.ZCTextIndex.NBest import NBest
 from Products.ZCTextIndex.QueryParser import QueryParser
-from PipelineFactory import splitter_factory, element_factory
+from PipelineFactory import element_factory
 
 from Products.ZCTextIndex.CosineIndex import CosineIndex
 from Products.ZCTextIndex.OkapiIndex import OkapiIndex
@@ -174,16 +174,23 @@
 
 manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
 
-def manage_addLexicon(self, id, title='', splitter_name=None, 
-                      element_names=None, REQUEST=None):
+def manage_addLexicon(self, id, title='', elements=[], REQUEST=None):
     """Add ZCTextIndex Lexicon"""
     
-    elements = [element_factory.instantiate(name) for name in element_names]
-    
-    if splitter_name:
-        elements.insert(0, splitter_factory.instantiate(splitter_name))
+    pipeline = []
+    for el_record in elements:
+        if not hasattr(el_record, 'name'): 
+            continue # Skip over records that only specify element group
+        element = element_factory.instantiate(el_record.group, el_record.name)
+        if element is not None:
+            if el_record.group == 'Word Splitter':
+                # I don't like hardcoding this, but its a simple solution
+                # to get the splitter element first in the pipeline
+                pipeline.insert(0, element)
+            else:
+                pipeline.append(element)
 
-    lexicon = PLexicon(id, title, *elements)
+    lexicon = PLexicon(id, title, *pipeline)
     self._setObject(id, lexicon)
     if REQUEST is not None:
         return self.manage_main(self, REQUEST, update_menu=1)


=== Products/ZCTextIndex/__init__.py 1.6 => 1.7 ===
 """
 
-from PipelineFactory import splitter_factory, element_factory
+from PipelineFactory import element_factory
 from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter
 
 def initialize(context):
@@ -36,17 +36,17 @@
         permission = 'Add Vocabularies',
         constructors = (ZCTextIndex.manage_addLexiconForm,
                         ZCTextIndex.manage_addLexicon,
-                        getSplitterNames, getElementNames),
+                        getElementGroups, getElementNames),
         icon='www/lexicon.gif'
     )
     
 ## Functions below are for use in the ZMI constructor forms ##
     
-def getSplitterNames(self):
-    return splitter_factory.getFactoryNames()
+def getElementGroups(self):
+    return element_factory.getFactoryGroups()
     
-def getElementNames(self):
-    return element_factory.getFactoryNames()
+def getElementNames(self, group):
+    return element_factory.getFactoryNames(group)
     
 def getIndexTypes(self):
     return ZCTextIndex.index_types.keys()