[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - ResultListNG.py:1.1 TextIndexCommon.py:1.1 TextIndexNG.py:1.1 TextOperators.py:1.1 __init__.py:1.1 test.py:1.1

Andreas Jung andreas@zope.com
Fri, 4 Jan 2002 11:38:16 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv28876/TextIndexNG

Added Files:
	ResultListNG.py TextIndexCommon.py TextIndexNG.py 
	TextOperators.py __init__.py test.py 
Log Message:
- added prototype for new TextIndex to support NEAR search and
  stemmer support for eleven languages.
- TextIndex code cleanup
- refactoring of the existing TextIndex code


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/ResultListNG.py ===
#############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################

from BTrees.IIBTree import IIBucket,IISet
from BTrees.OOBTree import OOSet
from BTrees.IIBTree import weightedIntersection,  difference
from BTrees.IIBTree import union as Iunion
from BTrees.IIBTree import intersection as Iintersection
from BTrees.OOBTree import union as Ounion

from types import TupleType
from TextIndexCommon import debug


class ResultListNG:
    """ class to keep results for TextIndexNG queries """
  
    def __init__(self, d, words, index):

        # usually instance of TextIndexNG
        self._index = index

        # words is either an OOSet or a mapping
        if type(words) is not OOSet: words=OOSet(words)
        self._words = words

        self._dict = d        

        self.keys    = self._dict.keys
        self.values  = self._dict.values
        self.items   = self._dict.items


    def Intersection(self,d1,d2):   
        """ Intersection between the documentIds (keys) of two dictionaries.
            The list of positions are merged.
        """

        r = {}

        docIds = Iintersection(
                    IISet(d1.keys()),
                    IISet(d2.keys())
                    )
            
        for docId in docIds: 
            r[docId] = Iunion( d1[docId], d2[docId])

        return r


    def Union(self,d1,d2):
        """ Union of the documentIds (keys) of two dictionaries.
            The list of positions are merged.
        """

        r = d1.copy()
            
        for docId in d2.keys():

            if d1.has_key(docId):
                r[docId] = Iunion( d1[docId], d2[docId])
            else:
                r[docId] = d2[docId]

        return r
            
            

    def __and__(self, x):
        """ and """

        return self.__class__(
            self.Intersection(self._dict, x._dict),
            Ounion(self._words, x._words),
            self._index,
            )


    def and_not(self, x):
        """ and not  """

        return self.__class__(
            difference(self._dict, x._dict),
            self._words,
            self._index,
            )
  
    def __or__(self, x):
        """ or """

        return self.__class__(
            self.Union(self._dict, x._dict),
            Ounion(self._words, x._words),
            self._index,
            )


    def near(self, x):
        """ near search """

        debug('-'*78)
        debug('entering near:')
        debug(self._words)
        debug(x._words)


        result = IIBucket()

        dict  = self._dict
        xdict = x._dict

        positions = self._index.positions

        debug("applying near search for documents:")
        debug("\t",dict)
        debug("\t",xdict)

        # inters is an IISet() with documentIds.
        
        inters = self.Intersection(dict, xdict)

        debug("Intersection is:")
        debug('\t',inters)

        for docId in inters.keys():

            debug('searching for positions',docId,self._words)
            p1 = positions(docId, self._words)
           
            debug('searching for positions',docId,x._words)
            p2 = positions(docId, x._words)

            leftPositions = IISet() 
            for set in p1.values():
                leftPositions = Iunion(leftPositions,set)

            rightPositions = IISet() 
            for set in p2.values():
                rightPositions = Iunion(rightPositions,set)


            for pl in leftPositions:
                for pr in rightPositions:
                    diff = abs(pl - pr)
                
                    if diff < 4:
                        debug('difference for (%d,%d): %d' % (pl,pr,diff))
                        result[docId] = 0


        return self.__class__(
            result, Ounion(self._words, x._words), self._index)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexCommon.py ===
##############################################################################
# 
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
#############################################################################


import re
from TextOperators import operator_dict, Near
from types import ListType

_debug = 1


def parse(s):
    """Parse parentheses and quotes"""

    l = []
    tmp = s.lower()

    p = parens(tmp)

    while p is not None:

        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens

        l = l + quotes(p[0])
        l.append(parse(p[1]))

        # continue looking through the rest of the string

        tmp = p[2]
        p = parens(tmp)

    return l + quotes(tmp)



def parse2(q, default_operator, operator_dict=operator_dict):
    """Find operators and operands"""

    isop = operator_dict.has_key
    i = 0

    while i < len(q):
        e = q[i]

        if isinstance(e, ListType):
            q[i] = parse2(e, default_operator)
            if i % 2:
                q.insert(i, default_operator)
                i+=1 

        elif i % 2:

            # This element should be an operator

            if isop(e):

                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]

            else:

                # Insert the default operator.
                q.insert(i, default_operator)
                i+=1 

        i+=1 

    return q


def parens(s, parens_re=re.compile('[()]').search):

    mo = parens_re(s)
    if mo is None: return
    
    open_index = mo.start(0) + 1
    paren_count = 0

    while mo is not None:
        index = mo.start(0)
    
        if s[index] == '(':
            paren_count = paren_count + 1

        else:
            paren_count = paren_count - 1
            if paren_count == 0:
                return (s[:open_index - 1], s[open_index:index],
                        s[index + 1:])
            if paren_count < 0:
                break
        mo = parens_re(s, index + 1)

    raise QueryError, "Mismatched parentheses"      


def quotes(s):

    if '"' not in s: return s.split()
    
    # split up quoted regions
    splitted = re.split('\s*\"\s*', s)

    if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"
    
    for i in range(1,len(splitted),2):

        # split the quoted region into words
        words = splitted[i] = splitted[i].split()
        
        # put the Proxmity operator in between quoted words
        j = len(words) - 1
        while j > 0:
            words.insert(j, Near)
            j = j - 1

    i = len(splitted) - 1
    while i >= 0:
        # split the non-quoted region into words
        splitted[i:i+1] = splitted[i].split()
        i = i - 2

    return filter(None, splitted)


def debug(*args):
    """ used by TextIndexNG for dev. purposes """

    import sys

    if _debug:

        for a in args:
            sys.stdout.write(str(a))

        sys.stdout.write('\n')
        sys.stdout.flush()


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################

"""
Text Index
"""

__version__ = '$Revision: 1.1 $'[11:-2]

import  re
import warnings
from Globals import Persistent,DTMLFile
from zLOG import LOG, ERROR
from OFS.SimpleItem import SimpleItem
from Acquisition import Implicit
from Products.PluginIndexes.TextIndexNG.ResultListNG import ResultListNG
from Products.PluginIndexes import PluggableIndex       
from Products.PluginIndexes.common.util import parseIndexRequest

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.OOBTree import OOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet, IITreeSet
from BTrees.IIBTree import difference, weightedIntersection

from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
from Products.PluginIndexes.TextIndex.GlobbingLexicon import GlobbingLexicon

from Products.PluginIndexes.TextIndex import Splitter

import Stemmer

from types import IntType, StringType, UnicodeType
from TextOperators import *
from TextIndexCommon import *


class TextIndexNG(PluggableIndex.PluggableIndex, Persistent,
     Implicit, SimpleItem):

    __implements__ = (PluggableIndex.PluggableIndexInterface,)

    meta_type='TextIndexNG'

    manage_options= (
        {'label': 'Settings',     
         'action': 'manage_main',
         'help': ('TextIndex','TextIndex_Settings.stx')},
    )

    query_options = ["query","operator"]
 
    def __init__(self 
                 , id 
                 , extra= None
                 , caller = None
                ):

        self.id            = id

        self.useSplitter   = getattr(extra,'useSplitter',   'ZopeSplitter')
        self.useStemmer    = getattr(extra,'useStemmer',    None)
        self.useOperator   = getattr(extra,'useOperator',   'and')
        self.useGlobbing   = getattr(extra,'useGlobbing',   1)
        self.lexicon       = getattr(extra,'lexicon',       None)
        self.useNearSearch = getattr(extra,'useNearSearch', 1)
        self.nearDistance  = getattr(extra,'nearDistance',  5)

        if self.lexicon== 'None':     self.lexicon    = None
        if self.useStemmer == 'None': self.useStemmer = None

        self.clear()
                        

    def clear(self):

        self._IDX     = IOBTree()

        # get splitter function
        self._splitterfunc = self._stemmerfunc = None

        if self.useSplitter:
            self._splitterfunc = Splitter.getSplitter(self.useSplitter)

        # stemmer function
        if self.useStemmer:
            self._stemmerfunc = Stemmer.Stemmer(self.useStemmer).stem

        if self.lexicon:
            self._LEXICON = self.lexicon
        else:
            if self.useGlobbing:
                self._LEXICON = GlobbingLexicon()
                debug('created new globbing lexicon')
                if self._stemmerfunc:
                    debug('stemming disabled because globbing enabled')
                    self._stemmerfunc = None

            else:
                self._LEXICON = Lexicon()
                debug('created new lexicon')


        self._v_getWordId      = self._LEXICON.getWordId
        self._v_getWordById    = self._LEXICON.getWord
        self._v_getIdByWord    = self._LEXICON.get

    def __nonzero__(self):
        return not not self._unindex
    

    def insertForwardEntry(self,wordId,pos,documentId):

        # self._IDX is a mapping:
        # wordId -> documentId -> [positions]
        
        idx = self._IDX

        if idx.has_key(wordId) == 0:
            idx[wordId] = IOBTree()

        tree = idx[wordId] 
        if tree.has_key(documentId) == 0:
            tree[documentId] = IISet() 

        tree[documentId].insert(pos)



    def _printIndex(self):

        for wordId in self._IDX.keys():
            print '-'*78
            print wordId,self._v_getWordById(wordId),
            print 
            for k,v in self._IDX[wordId].items():
                print k,v

    def index_object(self, documentId, obj, threshold=None):

        try:
            source = getattr(obj, self.id)
            if callable(source): source = str(source())
            else:                source = str(source)
        except (AttributeError, TypeError):
            return 0

        # sniff the object for 'id'+'_encoding'
        
        try:
            encoding = getattr(obj, self.id+'_encoding')
            if callable(encoding ):
                encoding = str(encoding())
            else:
                encoding = str(encoding)
        except (AttributeError, TypeError):
            encoding = 'latin1'


        # Split the text into a list of words
        words = self._splitterfunc(source,encoding=encoding)
    
        for i in range(len(words)):
            word = words[i]

            # stem the single word        
            if self._stemmerfunc:
                word = self._stemmerfunc(word)

            # get (new) wordId for word
            wid = self._v_getWordId(word)

            # and insert the wordId, its position and the documentId 
            # in the index
            self.insertForwardEntry(wid,i,documentId)


    def __getitem__(self, word):
        """Return an InvertedIndex-style result "list"

        Note that this differentiates between being passed an Integer
        and a String.  Strings are looked up in the lexicon, whereas
        Integers are assumed to be resolved word ids. """

        if isinstance(word, IntType):

            # We have a word ID

            r = {}

            res = self._IDX.get(word, None)

            for docId in  res.keys():
                r[docId] = self._IDX[word][docId]

            return ResultListNG(r , (word,), self)

        else:

            # We need to stem

            if self._stemmerfunc:
                word = self._stemmerfunc(word)

            wids = self._v_getIdByWord(word)

            if wids:
                r = {}

                res  = self._IDX.get(wids[0], None)
                for docId in  res.keys():
                    r[docId] = self._IDX[wids[0]][docId]

            else:
                r={}

            return ResultListNG(r, (word,), self)


    def getLexicon(self):
        return self._LEXICON


    def _apply_index(self, request, cid=''): 
        """ Apply the index to query parameters given in the argument,
        request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.
 
        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.  
        """

        record = parseIndexRequest(request,self.id,self.query_options)
        if record.keys==None: return None

        # Changed for 2.4
        # We use the default operator that can me managed via the ZMI

        qop = record.get('operator', self.useOperator)
        query_operator = operator_dict.get(qop)

        if query_operator is None:
            raise exceptions.RuntimeError, ("Invalid operator '%s' "
                                            "for a TextIndex" % qop)
        r = None

        for key in record.keys:
            key = key.strip()
            if not key:
                continue

            b = self.query(key, query_operator)
            w, r = weightedIntersection(r, b)

        if r is not None:
            return r, (self.id,)
        
        return (IIBucket(), (self.id,))


    def positions(self,docId, words):
        """ search all positions for a list of words for
            a given document given by its documentId.
            positions() returns a mapping word to
            list of positions of the word inside the document.
        """
      
        debug('searching positions docid: %s, words: %s' % (docId,words))

        res = OOBTree()

        for w in words:

            if isinstance(w,IntType):
                wid = w
                word = self._v_getWordById(w) 
            else:
                wid = self._v_getIdByWord(w)[0]
                word = w

            if self._IDX[wid].has_key(docId):
                res[w] = self._IDX[wid][docId]


        for k,v in res.items():
            debug(k,v)

        return res


    def query(self, s, default_operator=Or):
        """ Evaluate a query string.
        
        Convert the query string into a data structure of nested lists
        and strings, based on the grouping of whitespace-separated
        strings by parentheses and quotes.  The 'Near' operator is
        inserted between the strings of a quoted group.

        The Lexicon is given the opportunity to transform the
        data structure.  Stemming, wildcards, and translation are
        possible Lexicon services.

        Finally, the query list is normalized so that it and every
        sub-list consist of non-operator strings or lists separated
        by operators. This list is evaluated.
        """

        # First replace any occurences of " and not " with " andnot "
        s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)

        # Parse parentheses and quotes
        q = parse(s)

        # Allow the Lexicon to process the query
        q = self.getLexicon().query_hook(q)

        # Insert the default operator between any two search terms not
        # already joined by an operator.
        q = parse2(q, default_operator)
        debug('before eval',q)

        # evalute the final 'expression'
        return self.evaluate(q)


    def get_operands(self, q, i):

        """Evaluate and return the left and right operands for an operator"""

        try:
            left  = q[i - 1]
            right = q[i + 1]
        except IndexError:
            raise QueryError, "Malformed query"

        if isinstance(left, IntType):
            left = self[left]
        elif isinstance(left, StringType) or isinstance(left,UnicodeType):
            left = self[left]        
        elif isinstance(left, ListType):
            left = self.evaluate(left)

        if isinstance(right, IntType):
            right = self[right]
        elif isinstance(right, StringType) or isinstance(right,UnicodeType):
            right = self[right]       
        elif isinstance(right, ListType):
            right = self.evaluate(right)

        return (left, right)



    def evaluate(self, query):
        """Evaluate a parsed query"""

        # Strip off meaningless layers
        while isinstance(query, ListType) and len(query) == 1:
            query = query[0]

        # If it's not a list, assume a string or number
        if not isinstance(query, ListType):
            return self[query]

        # Now we need to loop through the query and reduce
        # operators.  They are currently evaluated in the following
        # order: AndNot -> And -> Or -> Near
        i = 0
        while (i < len(query)):
            if query[i] is AndNot:
                left, right = self.get_operands(query, i)
                val = left.and_not(right)
                query[(i - 1) : (i + 2)] = [ val ]
            else: i = i + 1

        i = 0
        while (i < len(query)):
            if query[i] is And:
                left, right = self.get_operands(query, i)
                val = left & right
                query[(i - 1) : (i + 2)] = [ val ]
            else: i = i + 1

        i = 0
        while (i < len(query)):
            if query[i] is Or:
                left, right = self.get_operands(query, i)
                val = left | right
                query[(i - 1) : (i + 2)] = [ val ]
            else: i = i + 1

        i = 0
        while (i < len(query)):
            if query[i] is Near:
                left, right = self.get_operands(query, i)
                val = left.near(right)
                query[(i - 1) : (i + 2)] = [ val ]
            else: i = i + 1

        if (len(query) != 1):
            raise QueryError, "Malformed query"

        return query[0]


    def numObjects(self):
        """ return number of index objects """
        return len(self._IDX)


    def manage_setPreferences(self,extra,
                               REQUEST=None,RESPONSE=None,URL2=None):
        """ preferences of TextIndex """

        pass

        if RESPONSE:
            RESPONSE.redirect(URL2 + '/manage_main?manage_tabs_message=Preferences%20saved')


    manage_workspace  = DTMLFile("dtml/manageTextIndexNG",globals())


manage_addTextIndexNGForm = DTMLFile('dtml/addTextIndexNG', globals())

def manage_addTextIndexNG(self, id, extra, REQUEST=None, RESPONSE=None, URL3=None):
    """Add a new TextIndexNG """
    return self.manage_addIndex(id, 'TextIndexNG', extra, REQUEST, RESPONSE, URL3)



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextOperators.py ===
##############################################################################
# 
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
#############################################################################


class Op:
    """ TextIndex operator class """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name
    __str__ = __repr__



AndNot      = Op('andnot')
And         = Op('and')
Or          = Op('or')
Near        = Op('...')

operator_dict = {
    'andnot': AndNot, 
    'and':    And, 
    'or':     Or,
    '...':    Near, 
    'near':   Near,
    AndNot:   AndNot, 
    And:      And, 
    Or:       Or, 
    Near:     Near
}

QueryError    = 'TextIndex.QueryError'


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/__init__.py ===


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/test.py ===
#!/usr/bin/env python2.1

from Products.PluginIndexes.TextIndexNG import TextIndexNG
import os, sys, re,traceback, atexit
import readline

histfile = os.path.expanduser('~/.pyhist')
try:
    readline.read_history_file(histfile)
except IOError: pass
atexit.register(readline.write_history_file,histfile)

datadir = '/work/html//doc/python-2.2/lib'
datadir = '/work/html//doc/python-2.2/ext'

class extra: pass


class TO:
    
    def __init__(self,txt):
        self.text = txt


ex = extra()
ex.useSplitter='ZopeSplitter'
ex.useStemmer='porter'
ex.useOperator='and'
ex.lexicon = None
ex.useGlobbing=1


TI = TextIndexNG.TextIndexNG('text',ex)
t1 = TO ('this text is a suxing text')
t2 = TO ('the suxing quick brown fox jumps over the lazy dog because the dog is quick and jumps quick') 
TI.index_object(-1,t1)
TI.index_object(-2,t2)

files = os.listdir(datadir)
files.sort()

for i in range(len(files)):
    f = files[i]
    print f
    fname = os.path.join(datadir,f)
    data = open(fname).read()

    T = TO(data)
    TI.index_object(i,T)



#TI.newTree()
#print 
#print TI._apply_index({'text':{'query':'suxing'}})
#print 
#print TI._apply_index({'text':{'query':'blabla'}})
#print 
#print TI._apply_index({'text':{'query':'suxing and quick'}})
##print TI._apply_index({'text':{'query':'(wurm dog and cat) or dummnase'}})
##print TI._apply_index({'text':{'query':'("wurm dog blabla the" and cat) or dummnase'}})
#print 
#x = TI._apply_index({'text':{'query':'dog and lazy'}})
#print x


while 1:

    line = raw_input("> ")
    
    
    try:
        nums,dummy = TI._apply_index({'text':{'query':line}})

        print "Result:"
        
        for k,v in nums.items():
            print k,files[k]

        print 

    except:
        traceback.print_exc()