[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser - TextIndexGgen.py:1.1.2.1 kjParseBuild.py:1.1.2.1 kjParser.py:1.1.2.1 kjSet.py:1.1.2.1 Makefile:NONE

Andreas Jung andreas@digicool.com
Mon, 14 Jan 2002 15:00:41 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser
In directory cvs.zope.org:/tmp/cvs-serv19985

Added Files:
      Tag: ajung-textindexng-branch
	TextIndexGgen.py kjParseBuild.py kjParser.py kjSet.py 
Removed Files:
      Tag: ajung-textindexng-branch
	Makefile 
Log Message:
added new parser using kwParsing


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/TextIndexGgen.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################

""" Grammer for TextIndex queries """


REGENERATEONLOAD = 1

import string
GRAMMARSTRING ="""
       Value ::  ## indicates Value is the root nonterminal for the grammar
    @R BaseRule            :: Value    >> Term 
    @R Braces1Rule         :: Term     >> openp Term closep
    @R Braces2Rule         :: Term     >> openp Term closep Term
    @R DoubleTermRule      :: Term     >> Term  Term
    @R AndTermRule         :: Term     >> Term and Term
    @R OrTermRule          :: Term     >> Term or Term
    @R Strrule             :: Term     >> str


"""
COMPILEDFILENAME   = "TextIndexG.py"
MARSHALLEDFILENAME = "TextIndexG.mar"
OPENPREGEX         = r'('
CLOSEPREGEX        = r')'
ANDREGEX           = 'and'
ORREGEX            = 'or'
STRREGEX           = '[a-zA-Z]*'

### declare interpretation functions and regexes for terminals


class Stack:

    def __init__(self):
        self.lst = []

    def push(self,item):
        self.lst.append(item)

    def pop(self):
        item = self.lst[-1]
        self.lst = self.lst[:-1]
        return item


class Collector(Stack):

    def __init__(self, default_op='and'):
        Stack.__init__(self)

        self._current    = []
        self._op         = []
        self._default_op = default_op
        self._result     = ''


    def addWord(self,word):
        self._current.append( 'LL("%s")' % word)

    def setOp(self, op):
        self._op = op


    def newLevel(self):

        # push the current term list and operator onto the stack
        self.push( (self._current, self._op))

        self._current = []


    def endLevel(self):

        if not self._op: 
            op = self._default_op
        else:
            op = self._op

        if self._current:
            r = ','.join(self._current)
            r = "%s(%s)" % (op, r)

            # get top element from stack
            topEl,topOp = self.pop()
            topEl.append( r )
            self.push( (topEl, topOp))

            self._current = []

        else:
            # merge the two top elements of the stack
            topEl,topOp = self.pop()
            top1El,top1Op = self.pop()

            r = ','.join(topEl)
            r = "%s(%s)" % (topOp, r)

            top1El.append(r)

            self.push( (top1El, top1Op) )

    def __str__(self):
        return "Collector: %s %s" % (self._current,self._op)
    
    __repr__ = __str__

    def getResult(self):

        if self._current:
            # fall back to the default operator when none was set,
            # mirroring endLevel()
            op = self._op or self._default_op
            r = ','.join(self._current)
            r = "%s(%s)" % (op, r)

            return r

        else:
            topEl,topOp = self.pop()
            op = topOp or self._default_op
            r = ','.join(topEl)
            r = "%s(%s)" % (op, r)

            return r
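
# Illustrative sketch (not part of the original checkin): driving the
# Collector by hand, without the parser.  addWord() collects terms,
# setOp() records the boolean operator and getResult() renders the
# nested prefix expression that the query parser ultimately returns.
def _collector_example():
    c = Collector()
    c.addWord('foo')
    c.addWord('bar')
    c.setOp('or')
    print c.getResult()     # prints: or(LL("foo"),LL("bar"))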


def openParens(x):
    # terminal callback for '(': start a new nesting level.
    # C is the module-global Collector bound by the caller.
    C.newLevel()

def closeParens(x):
    # terminal callback for ')': close the current nesting level
    C.endLevel()


def wordFound(word):
    # terminal callback for a search term
    C.addWord( word )
    return word

def operandToken(op):
    # terminal callback for 'and' / 'or'
    C.setOp(op)


def DeclareTerminals(Grammar):
    Grammar.Addterm("and",    ANDREGEX,     operandToken)
    Grammar.Addterm("or",     ORREGEX,      operandToken)
    Grammar.Addterm("openp",  OPENPREGEX,   openParens)
    Grammar.Addterm("closep", CLOSEPREGEX,  closeParens)
    Grammar.Addterm("str",    STRREGEX,     wordFound)


def BindRules(Grammar):
     pass
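
# Note (not part of the original checkin): kwParsing also allows
# reduction functions to be attached to the named rules declared in
# GRAMMARSTRING (hypothetically something like
#     Grammar.Bind("AndTermRule", someFunction)
# ).  This parser does all of its work in the terminal callbacks
# instead, which is why BindRules() is intentionally empty.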

# This function generates the grammar and dumps it to a file.  
def GrammarBuild():
    import kjParseBuild

    TextIndexG = kjParseBuild.NullCGrammar()
    TextIndexG.SetCaseSensitivity(0) # grammar is not case sensitive for keywords
    DeclareTerminals(TextIndexG)
#    TextIndexG.Keywords("and or")
#    TextIndexG.punct("()")
    TextIndexG.Nonterms("Value Term")
    TextIndexG.Declarerules(GRAMMARSTRING)
    TextIndexG.Compile()

    print "dumping as python to "+COMPILEDFILENAME
    outfile = open(COMPILEDFILENAME, "w")
    TextIndexG.Reconstruct("TextIndexG",outfile,"GRAMMAR")
    outfile.close()

    print "dumping as binary to "+MARSHALLEDFILENAME
    outfile = open(MARSHALLEDFILENAME, "wb")
    TextIndexG.MarshalDump(outfile)
    outfile.close()

    BindRules(TextIndexG)
    return TextIndexG


# this function initializes the compiled grammar from the generated file.  
def LoadTextIndexG():
    import TextIndexG
    # reload to make sure we get the most recent version!
    # (only needed when debugging the grammar).
    reload(TextIndexG)
    TextIndexG = TextIndexG.GRAMMAR()
    DeclareTerminals(TextIndexG)
    BindRules(TextIndexG)
    return TextIndexG

def unMarshalTextIndexG():
    import kjParser
    infile = open(MARSHALLEDFILENAME, "rb")
    TextIndexG = kjParser.UnMarshalGram(infile)
    infile.close()
    DeclareTerminals(TextIndexG)
    BindRules(TextIndexG)
    return TextIndexG
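
# Illustrative sketch (not part of the original checkin): parsing one
# query against the unmarshalled grammar.  The terminal callbacks
# above feed the module-global Collector C, so C must be rebound
# before every parse (the __main__ test below does the same).
def _parse_example(query):
    global C
    C = Collector()
    grammar = unMarshalTextIndexG()
    grammar.DoParse1(query, {})
    return C.getResult()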


########## test the grammar generation
if REGENERATEONLOAD:
   print "(re)generating the TextIndexG grammar in file TextIndexG.py"
   Dummy = GrammarBuild()
   print "(re)generation done."


if __name__ == '__main__':

    print "loading grammar as python"
    TextIndexG = LoadTextIndexG()

    Context = { }

    for item in ['a','(a)','(a) (b)','(a  b)','(a b)(c or d) ' ,
            '(a and b) or (c and dd)','a and (b or c or (x and y))']:

        print '-'*78
        C = Collector() 
        print item    
        test = TextIndexG.DoParse1( item, Context)

        print 'res:',C.getResult()



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/kjParseBuild.py === (1229/1329 lines abridged)
#
# python code for building a parser from a grammar
#  Copyright Aaron Robert Watters, 1994
#

# BUGS:
#  A bad grammar that has no derivations for
#  the root nonterminal may cause a name error
#  on the variable "GoodStartingPlace"

# this needs to be modified so the RULEGRAM is loaded from a
# compiled representation if available.

import string
import kjSet
import kjParser
import regex

# import some constants
from kjParser import \
 TERMFLAG, NOMATCHFLAG, MOVETOFLAG, REDUCEFLAG, TRANSFLAG, KEYFLAG, \
 NONTERMFLAG, EOFFLAG, ENDOFFILETOKEN

PMODULE = kjParser.THISMODULE

# errors raised here
TokenError = "TokenError" # may happen on autogen with bad grammar
NotSLRError = "NotSLRError" # may happen for nonSLR grammar

# set this flag for regression testing at each load
RUNTESTS = 0
# set this flag to abort automatic generation on Errors
ABORTONERROR = 0

# token used to mark null productions
NULLTOKEN = (None,None)


# a derived FSM class, with closure computation methods defined
# (compilable FSMachine)
#
class CFSMachine(kjParser.FSMachine):

  def __init__(self, nonterm):
      kjParser.FSMachine.__init__(self, nonterm)

  # return the epsilon closure of the FSM as a new FSM
  #
  # DoNullMap, if set, will map unexpected tokens to
  # the "empty" state (usually creating a really big fsm)

[-=- -=- -=- 1229 lines omitted -=- -=- -=-]

  RX = kjParser.ParseRule( X, [ oppar, Y, clpar ] )
  RY = kjParser.ParseRule( Y, [] )
  rl2 = [RX,RY]
  rs2 = ruleset(X, rl2)
  rs2.CompFirst()
  rs2.CompFollow()
  rs2.CompSLRNFA()
  rs2.CompDFA()
  rs2.SLRFixDFA()
  DFA2 = rs2.DFA
  
  ttt2 = dummy()
  def TESTDFA2( STRING, DOREDUCTIONS = 1):
     return TESTDFA( STRING, ttt2, DFA2, rl2, DOREDUCTIONS )
  
  # the following grammar should fail to be slr
  # (Aho,Ullman p. 213)
  
  S = kjParser.nonterminal("S")
  L = kjParser.nonterminal("L")
  R = kjParser.nonterminal("R")
  RS1 = kjParser.ParseRule( S, [L, equals, R] )
  RS2 = kjParser.ParseRule( S, [R], echo )
  RL1 = kjParser.ParseRule( L, [star, R])
  RL2 = kjParser.ParseRule( L, [id])
  RR1 = kjParser.ParseRule( R, [L] )
  rs3 = ruleset(S, [RS1,RS2,RL1,RL2,RR1])
  rs3.CompFirst()
  rs3.CompFollow()
  rs3.CompSLRNFA()
  rs3.CompDFA()
  #rs3.SLRFixDFA() # should fail and does.

  # testing RULEGRAM
  ObjG = NullCGrammar()
  ObjG.Addterm("id","id",echo)
  ObjG.Nonterms("T E Ep F Tp")
  ObjG.Keywords("begin end")
  ObjG.punct("+*()")
  ObjG.comments(["--.*\n"])
  # PROBLEM WITH COMMENTS???
  Rulestr = """
    ## what a silly grammar!
    T ::
    @R One :: T >> begin E end
    @R Three :: E >>
    @R Two :: E >> E + T
    @R Four :: E >> ( T )
    """
  RL = RULEGRAM.DoParse1( Rulestr, ObjG )


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/kjParser.py === (1200/1300 lines abridged)
#
# python for parser interpretation
#  Copyright Aaron Robert Watters, 1994
#

# BUGS:
# Lexical error handling is not nice
# Parse error handling is not nice
#
# Lex analysis may be slow for big grammars
# Setting case sensitivity for keywords MUST happen BEFORE
#   declaration of keywords.

import kjSet
import string
import regex
import regsub

# set this flag for regression testing at each load
RUNTESTS = 0

# set this flag to enable warning for default reductions
WARNONDEFAULTS = 0

# some local constants
NOMATCHFLAG = -2 # FLAG FOR NO MATCH IN FSM
MOVETOFLAG = -3 # FLAG FOR "SHIFT" IN SN FSM
REDUCEFLAG = -4 # FLAG FOR REDUCTION IN FSM
TRANSFLAG = -5 # FLAG FOR TRANSIENT STATE IN FSM
KEYFLAG = -6 # FLAG FOR KEYWORD
NONTERMFLAG = -7 # FLAG FOR NONTERMINAL
TERMFLAG = -8 # FLAG FOR TERMINAL
EOFFLAG = "*" # FLAG for End of file

# set this string to the Module name (filename)
# used for dumping reconstructable objects
THISMODULE = "kjParser"

# regular expression for matching whitespace
WHITERE = "["+string.whitespace+"]+"
WHITEREGEX = regex.compile(WHITERE)

# local errors
LexTokenError = "LexTokenError" # may happen on bad string
UnkTermError = "UnkTermError" # ditto
BadPunctError= "BadPunctError" # if try to make whitespace a punct
ParseInitError = "ParseInitError" # shouldn't happen?
#EOFError # may happen on bad string

[-=- -=- -=- 1200 lines omitted -=- -=- -=-]

      for tokenindex in range(len(tokens)):
          (kind,name) = tokens[tokenindex]
          if kind == KEYFLAG:
             tokens[tokenindex] = LexD.keyword(name)
          elif not kind in [TERMFLAG, NONTERMFLAG]:
             raise FlowError, "unknown token type"
      # not needed
      self.tokens = tokens

   def MakeRules(self):
      Grammar = self.Gram
      Grammar.DFA.root_nonTerminal = self.Root
      NameIndex = Grammar.RuleNameToIndex
      RuleTuples = self.RuleTups
      nRules = len(RuleTuples)
      RuleList = [None] * nRules
      for index in range(nRules):
         (Name, Components) = RuleTuples[index]
         rule = apply(ParseRule, Components)
         rule.Name = Name
         RuleList[index] = rule
         NameIndex[Name] = index
      Grammar.RuleL = RuleList

   def MakeTransitions(self):
      Grammar = self.Gram
      DFA = Grammar.DFA
      StateTokenMap = DFA.StateTokenMap
      tokens = self.tokens
      # record the state number
      DFA.maxState = self.MaxStates
      # this is historical, unfortunately...  CLEAN IT UP SOMEDAY!
      # THE DFA.States DICT IS NOT NEEDED (?) (here)
      for state in range(1, self.MaxStates+1):
         DFA.States[state] = [TRANSFLAG]
      # record the reductions
      for (fromState, TokenIndex, rulenum) in self.reducts:
         DFA.SetReduction(fromState, tokens[TokenIndex], rulenum)
      # record the transitions
      for (fromState, TokenIndex, ToState) in self.moveTos:
         DFA.SetMap(fromState, tokens[TokenIndex], ToState)

   def Cleanup(self):
      Grammar = self.Gram
      Grammar.CleanUp()

################# FOLLOWING CODE IS FOR REGRESSION TESTING ONLY
################# DELETE IT IF YOU WANT/NEED
#### All tests for this module deleted, since
#### ParseBuild module tests are sufficient.  


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/kjSet.py ===

#
# sets implemented using mappings
#  Copyright Aaron Robert Watters, 1994
#
# these only work for "immutable" elements.
# probably not terribly efficient, but easy to implement
# and not as slow as conceivably possible.

def NewSet(Sequence):
    Result = {}
    for Elt in Sequence:
        Result[Elt] = 1
    return Result

def Empty(Set):
    if Set == {}:
       return 1
    else:
       return 0

def get_elts(Set):
    return Set.keys()

def member(Elt,Set):
    return Set.has_key(Elt)

# in place mutators:
# return 0 if no change, otherwise 1

def addMember(Elt,Set):
    change = 0
    if not Set.has_key(Elt):
       Set[Elt] = 1
       change = 1
    return change

def Augment(Set, OtherSet):
    change = 0
    for Elt in OtherSet.keys():
        if not Set.has_key(Elt):
           Set[Elt] = 1
           change = 1
    return change


def Mask(Set, OtherSet):
    change = 0
    for Elt in OtherSet.keys():
        if Set.has_key(Elt):
           del Set[Elt]
           change = 1
    return change

# side effect free functions

def Intersection(Set1, Set2):
    Result = {}
    for Elt in Set1.keys():
        if Set2.has_key(Elt):
           Result[Elt] = 1
    return Result

def Difference(Set1, Set2):
    Result = {}
    for Elt in Set1.keys():
        if not Set2.has_key(Elt):
           Result[Elt] = 1
    return Result

def Union(Set1,Set2):
    Result = {}
    Augment(Result,Set1)
    Augment(Result,Set2)
    return Result

def Subset(Set1,Set2):
    Result = 1
    for Elt in Set1.keys():
        if not Set2.has_key(Elt):
           Result = 0
           return Result # nonlocal
    return Result

def Same(Set1,Set2):
    if Subset(Set1,Set2) and Subset(Set2,Set1):
       return 1
    else:
       return 0
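
# Illustrative sketch (not part of the original module): the dict-based
# set operations in action.
def _set_example():
    a = NewSet([1, 2, 3])
    b = NewSet([2, 3, 4])
    print get_elts(Intersection(a, b))   # [2, 3], in some order
    print Subset(NewSet([2]), a)         # prints 1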

# directed graphs as Dictionaries of Sets
#   also only works for immutable nodes

def NewDG(pairlist):
    Result = {}
    for (source,dest) in pairlist:
        AddArc(Result, source, dest)
    return Result

def GetPairs(Graph):
    result = []
    Sources = Graph.keys()
    for S in Sources:
        Dests = get_elts( Graph[S] )
        ThesePairs = [None] * len(Dests)
        for i in range(0,len(Dests)):
            D = Dests[i]
            ThesePairs[i] = (S, D)
        result = result + ThesePairs
    return result

def AddArc(Graph, Source, Dest):
    change = 0
    if Graph.has_key(Source):
       Adjacent = Graph[Source]
       if not member(Dest,Adjacent):
          addMember(Dest,Adjacent)
          change = 1
    else:
       Graph[Source] = NewSet( [ Dest ] )
       change = 1
    return change

def Neighbors(Graph,Source):
    if Graph.has_key(Source):
       return get_elts(Graph[Source])
    else:
       return []

def HasArc(Graph, Source, Dest):
    result = 0
    if Graph.has_key(Source) and member(Dest, Graph[Source]):
       result = 1
    return result

def Sources(Graph):
    return Graph.keys()

# when G1, G2 and G3 are different graphs this results in
#   G1 = G1 U ( G2 o G3 )
# If G1 is identical to one of G2,G3 the result is somewhat
# nondeterministic (depends on dictionary implementation).
# However, guaranteed that AddComposition(G,G,G) returns
#    G1 U (G1 o G1) <= G <= TC(G1)
# where G1 is G's original value and TC(G1) is its transitive closure
# hence this function can be used for brute force transitive closure
#
def AddComposition(G1, G2, G3):
    change = 0
    for G2Source in Sources(G2):
        for Middle in Neighbors(G2,G2Source):
            for G3Dest in Neighbors(G3, Middle):
                if not HasArc(G1, G2Source, G3Dest):
                   change = 1
                   AddArc(G1, G2Source, G3Dest)
    return change

# in place transitive closure of a graph
def TransClose(Graph):
    change = AddComposition(Graph, Graph, Graph)
    somechange = change
    while change:
       change = AddComposition(Graph, Graph, Graph)
       if not somechange:
          somechange = change
    return somechange
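
# Illustrative sketch (not part of the original module): brute force
# transitive closure over a small graph.
def _closure_example():
    g = NewDG([(1, 2), (2, 3)])
    TransClose(g)
    print HasArc(g, 1, 3)   # prints 1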

########### GrabBag stuff
#
#  A GrabBag should be used to hold objects temporarily for future
#  use.  You can put things in and take them out (elements are
#  deleted as they are retrieved); that's all!

# make a new bag with nothing in it
#   BG[0] is insert cursor BG[1] is delete cursor, others are elts
#
OLD = 1
NEW = 0
START = 2
def NewBG():
    B = [None]*8 #default size
    B[OLD] = START
    B[NEW] = START
    return B

def BGempty(B):
    # other ops must maintain this: old == new iff empty
    return B[OLD] == B[NEW]

# may return new, larger structure
# must be used with assignment...  B = BGadd(e,B)
def BGadd(elt, B):
    cursor = B[NEW]
    oldlen = len(B)
    # look for an available position
    while B[cursor] != None:
       cursor = cursor+1
       if cursor >= oldlen: cursor = START
       if cursor == B[NEW]: #back to beginning
          break
    # resize if wrapped
    if B[cursor] != None:
       B = B + [None] * oldlen
       cursor = oldlen
       B[OLD] = START
    if B[cursor] != None:
       raise IndexError, "can't insert?"
    # add the elt
    B[cursor] = (elt,)
    B[NEW] = cursor
    # B nonempty so OLD and NEW should differ.
    if B[OLD] == cursor:
       B[NEW] = cursor + 1
       # wrap the insert cursor when it runs past the end of the list
       if B[NEW]>=len(B): B[NEW] = START
    return B

def BGgetdel(B):
    # find something to delete:
    cursor = B[OLD]
    blen = len(B)
    while B[cursor]==None:
       cursor = cursor+1
       if cursor>=blen: cursor = START
       if cursor == B[OLD]: break # wrapped
    if B[cursor] == None:
       raise IndexError, "delete from empty grabbag(?)"
    # test to see if bag is empty (position cursor2 at nonempty slot)
    cursor2 = cursor+1
    if cursor2>=blen: cursor2 = START
    while B[cursor2]==None:
       cursor2 = cursor2+1
       if cursor2>=blen: cursor2 = START
       # since B[cursor] is not yet deleted, the while loop terminates
    # get and delete the elt
    (result,) = B[cursor]
    B[cursor] = None
    # cursor == cursor2 iff bag is empty
    B[OLD] = cursor2
    if B[NEW] == cursor2: B[NEW] = cursor
    return result
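
# Illustrative sketch (not part of the original module): a bag really
# is just put/get with automatic deletion.  Note the reassignment
# idiom required by BGadd.
def _bag_example():
    B = NewBG()
    B = BGadd('x', B)
    B = BGadd('y', B)
    print BGgetdel(B)   # prints 'x' in this case
    print BGempty(B)    # prints 0: 'y' is still inside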

def BGtest(n):
    B = NewBG()
    rn = range(n)
    rn2 = range(n-2)
    for i in rn:
        for j in rn:
            B = BGadd( (i,j), B)
            B = BGadd( (j,i), B)
            x = BGgetdel(B)
        for j in rn2:
            y = BGgetdel(B)
        print (i, x, y)
    return B

=== Removed File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/Makefile ===