[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2 - LICENSE:1.1.2.1 Makefile:1.1.2.1 NOTES:1.1.2.1 yapps2.py:1.1.2.1 yapps2.tex:1.1.2.1 yappsrt.py:1.1.2.1
Andreas Jung
andreas@digicool.com
Fri, 11 Jan 2002 13:21:30 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2
In directory cvs.zope.org:/tmp/cvs-serv12830/queryparser/yapps2
Added Files:
Tag: ajung-textindexng-branch
LICENSE Makefile NOTES yapps2.py yapps2.tex yappsrt.py
Log Message:
added preliminary new parser for TextIndex queries
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2/LICENSE ===
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2/Makefile ===
ALL: $(shell echo *.g examples/*.g | sed s/.g/.py/g )
%.py: %.g yapps2.py yappsrt.py Makefile
python yapps2.py $<
DOC: yapps2.ps
yapps2.ps: yapps2.dvi
dvips -q yapps2.dvi -o yapps2.ps
yapps2.dvi: yapps2.tex
latex yapps2.tex
DISTRIB: ALL
zip -u yapps2.zip LICENSE yapps2.py yappsrt.py parsedesc.g examples/*.g NOTES yapps2.tex yapps2.ps Makefile
scp yapps2.zip iswim.stanford.edu:www/yapps2.zip
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2/NOTES ===
Debugging mode
Output to English?
Optimize unused variables
Optimize flag that would use global lists & dicts (for set membership)
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2/yapps2.py === (674/774 lines abridged)
# Yapps 2.0 - yet another python parser system
# Amit J Patel, January 1999
# See http://theory.stanford.edu/~amitp/Yapps/ for documentation and updates
# v2.0.1 changes (October 2001):
# * The exceptions inherit the standard Exception class (thanks Rich Salz)
# * The scanner can use either a different set of regular expressions
# per instance, or allows the subclass to define class fields with
# the patterns. This improves performance when many Scanner objects
# are being created, because the regular expressions don't have to
# be recompiled each time. (thanks Amaury Forgeot d'Arc)
from string import *
from yappsrt import *
import regex, re
INDENT = " "*4
class Generator:
def __init__(self, name, options, tokens, rules):
    """Set up a code generator for one grammar.

    name    -- name of the parser to generate
    options -- dict of generation options
    tokens  -- list of (token_name, regexp) pairs; a name of '#ignore'
               declares a token (named by its pattern entry) that the
               scanner should skip
    rules   -- list of (rule_name, params, parse_tree) triples
    """
    self.change_count = 0
    self.name = name
    self.options = options
    self.preparser = ''
    self.postparser = None
    self.tokens = {} # Map from tokens to regexps
    self.ignore = [] # List of token names to ignore in parsing
    self.terminals = [] # List of token names (to maintain ordering)
    for n,t in tokens:
        if n == '#ignore':
            # '#ignore' entries carry the real token name in t
            n = t
            self.ignore.append(n)
        if n in self.tokens.keys() and self.tokens[n] != t:
            # Same token name bound to two different patterns
            print 'Warning: token', n, 'multiply defined.'
        self.tokens[n] = t
        self.terminals.append(n)
    self.rules = {} # Map from rule names to parser nodes
    self.params = {} # Map from rule names to parameters
    self.goals = [] # List of rule names (to maintain ordering)
    for n,p,r in rules:
        self.params[n] = p
        self.rules[n] = r
        self.goals.append(n)
    import sys
    # Generated code goes to stdout unless the caller rebinds .output
    self.output = sys.stdout
def __getitem__(self, name):
[-=- -=- -=- 674 lines omitted -=- -=- -=-]
p = ParserDescription(ParserDescriptionScanner(s))
if not p: return
# Now parse the file
t = wrap_error_reporter(p, 'Parser')
if not t: return # Error
if preparser is not None: t.preparser = preparser
if postparser is not None: t.postparser = postparser
# Check the options
for f in t.options.keys():
for opt,_,_ in yapps_options:
if f == opt: break
else:
print 'Warning: unrecognized option', f
# Add command line options to the set
for f in flags.keys(): t.options[f] = flags[f]
# Generate the output
if dump:
t.dump_information()
else:
t.output = open(outputfilename, 'w')
t.generate_output()
if __name__=='__main__':
    # Command-line driver: yapps2.py [flags] input.g [output.py]
    import sys, getopt
    # '-f<flag>' selects a generation option; '--dump' prints the grammar
    optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['dump'])
    if not args or len(args) > 2:
        # Wrong number of positional arguments: show usage and the
        # available flags (yapps_options is defined earlier in this file)
        print 'Usage:'
        print ' python', sys.argv[0], '[flags] input.g [output.py]'
        print 'Flags:'
        print (' --dump' + ' '*40)[:35] + 'Dump out grammar information'
        for flag, _, doc in yapps_options:
            print (' -f' + flag + ' '*40)[:35] + doc
    else:
        # Read in the options and create a list of flags
        flags = {}
        for opt in optlist:
            # Match each '-f' value against the known option table
            for flag, name, _ in yapps_options:
                if opt == ('-f', flag):
                    flags[name] = 1
                    break
            else:
                if opt == ('--dump', ''):
                    flags['dump'] = 1
                else:
                    print 'Warning - unrecognized option: ', opt[0], opt[1]
        # generate(input [, output], **flags) does the real work
        apply(generate, tuple(args), flags)
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2/yapps2.tex === (1140/1240 lines abridged)
\documentclass[10pt]{article}
\usepackage{palatino}
%\usepackage{html}
\newcommand{\htmladdnormallink}[2]{#1 $\langle$#2$\rangle$}
\usepackage{color}
\usepackage{xspace}
\setlength{\headsep}{0in}
\setlength{\headheight}{0in}
\setlength{\textheight}{8.5in}
\setlength{\textwidth}{5.9in}
\setlength{\oddsidemargin}{0.25in}
\definecolor{darkblue}{rgb}{0,0,0.6}
\definecolor{darkerblue}{rgb}{0,0,0.3}
\newcommand{\mysection}[1]{\section{\textcolor{darkblue}{\textsf{#1}}}}
\newcommand{\mysubsection}[1]{\subsection{\textcolor{darkerblue}{\textsf{#1}}}}
%\bodytext{bgcolor=white text=black link=#004080 vlink=#006020}
\newcommand{\first}{\textsc{first}\xspace}
\newcommand{\follow}{\textsc{follow}\xspace}
\begin{document}
\begin{center}
\hfill \begin{tabular}{c}
{\Large The \emph{Yapps} Parser Generator System}\\
\verb|http://theory.stanford.edu/~amitp/Yapps/|\\
Version 2.0\\
\\
Amit J. Patel\\
\htmladdnormallink{}
{http://www-cs-students.stanford.edu/{\~{}}amitp/}
\end{tabular} \hfill \rule{0in}{0in}
\end{center}
\mysection{Introduction}
\emph{Yapps} (\underline{Y}et \underline{A}nother \underline{P}ython
\underline{P}arser \underline{S}ystem) is an easy to use parser
generator that is written in Python and generates Python code. There
are several parser generator systems already available for Python,
including \texttt{PyLR, kjParsing, PyBison,} and \texttt{mcf.pars,}
but I had different goals for my parser. Yapps is simple, is easy to
use, and produces human-readable parsers. It is not the fastest or
most powerful parser. Yapps is designed to be used when regular
expressions are not enough and other parser systems are too much:
[-=- -=- -=- 1140 lines omitted -=- -=- -=-]
somewhat ugly.
\item It would be nice to control choices with user-defined
predicates.
\item The most likely future extension is backtracking. A grammar
pattern like \verb|(VAR ':=' expr)? {{ return Assign(VAR,expr) }} : expr {{ return expr }}| would turn into code that attempted to
match \verb|VAR ':=' expr|. If it succeeded, it would run \verb|{{ return ... }}|. If it failed, it would match \verb|expr {{ return expr }}|. Backtracking may make it less necessary to
write LL(2) grammars.
\item The SRE module used in Python 1.6+ includes an undocumented
feature that allows one to figure out which of many alternatives
matched in a regular expression. In Yapps 2, each token is checked
separately. With this feature of SRE, it's possible to construct a
single regular expression that scans all possible tokens
simultaneously. However, there is a difficulty with generating
this regular expression automatically. Unlike the ``normal'' rules
for regexp matches, alternatives are \emph{not} matched using the
``longest match'' rule. Thus,
\verb:re.match(r'foo|foobar','foobar').group(0): returns
\verb:'foo':, but
\verb:re.match(r'foobar|foo','foobar').group(0): returns
\verb:'foobar':! This means that we have to sort the tokens so that
the longest ones occur first. However, this isn't easy to do.
\end{enumerate}
\mysection{References}
\begin{enumerate}
\item ANTLR/PCCTS, by Terrence Parr, is available at\\
\htmladdnormallink{The ANTLR Home Page}{http://www.antlr.org/}.
\item PyLR, by Scott Cotton, is at \\
\htmladdnormallink{his Starship
page}{http://starship.skyport.net/crew/scott/PyLR.html}.
\item John Aycock's \htmladdnormallink{Compiling Little Languages
Framework}{http://www.csr.UVic.CA/{\~{}}aycock/python/}.
\item PyBison, by Scott Hassan, can be found at\\
\htmladdnormallink{his Python Projects
page}{http://coho.stanford.edu/{\~{}}hassan/Python/}.
\item mcf.pars, by Mike C. Fletcher, is available at\\
\htmladdnormallink{his web
page}{http://www.golden.net/{\~{}}mcfletch/programming/}.
\item kwParsing, by Aaron Watters, is available at \\
\htmladdnormallink{his Starship
page}{http://starship.skyport.net/crew/aaron\_watters/kwParsing/}.
\end{enumerate}
\end{document}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/queryparser/yapps2/yappsrt.py ===
# Yapps 2.0 Runtime
#
# This module is needed to run generated parsers.
from string import *
import exceptions
import re
class SyntaxError(Exception):
    """Raised when the scanner or parser meets an unexpected token.

    pos -- character offset of the error in the input (-1 if unknown)
    msg -- human-readable description of what went wrong

    NOTE: intentionally shadows the builtin SyntaxError; the name is
    part of the yapps runtime API, so it is kept.
    """
    def __init__(self, pos=-1, msg="Bad Token"):
        self.pos = pos
        self.msg = msg
    def __repr__(self):
        if self.pos < 0: return "#<syntax-error>"
        # repr() replaces the deprecated backtick operator; identical output
        else: return "SyntaxError[@ char " + repr(self.pos) + ": " + self.msg + "]"
class NoMoreTokens(Exception):
    """Signals that the scanner has run out of input to tokenize."""
    pass
class Scanner:
    """Tokenizer for generated parsers.

    Produces tokens on demand as (start_pos, end_pos, type, text)
    tuples, recording for each token the restriction set that was in
    force when it was scanned.
    """
    def __init__(self, patterns, ignore, input):
        """Patterns is [(terminal,regex)...]
        Ignore is [terminal,...];
        Input is a string"""
        self.tokens = []        # tokens scanned so far
        self.restrictions = []  # restriction set used for each token
        self.input = input
        self.pos = 0            # current scan offset into self.input
        self.ignore = ignore
        # The stored patterns are a pair (compiled regex,source
        # regex). If the patterns variable passed in to the
        # constructor is None, we assume that the class already has a
        # proper .patterns list constructed
        if patterns is not None:
            self.patterns = []
            for k,r in patterns:
                self.patterns.append( (k, re.compile(r)) )

    def token(self, i, restrict=0):
        """Get the i'th token, and if i is one past the end, then scan
        for another token; restrict is a list of tokens that
        are allowed, or 0 for any token."""
        if i == len(self.tokens): self.scan(restrict)
        if i < len(self.tokens):
            # A cached token may only be reused if it was scanned under
            # an equal-or-looser restriction set than the one requested.
            if restrict and self.restrictions[i]:
                for r in restrict:
                    if r not in self.restrictions[i]:
                        # was a (deprecated) string exception; a real
                        # exception class keeps the same message
                        raise NotImplementedError(
                            "Unimplemented: restriction set changed")
            return self.tokens[i]
        raise NoMoreTokens()

    def __repr__(self):
        """Print the last 10 tokens that have been scanned in"""
        output = ''
        for t in self.tokens[-10:]:
            # repr() replaces the deprecated backtick operator
            output = '%s\n (@%s) %s = %s' % (output,t[0],t[2],repr(t[3]))
        return output

    def scan(self, restrict):
        """Should scan another token and add it to the list, self.tokens,
        and add the restriction to self.restrictions"""
        # Keep looking for a token, ignoring any in self.ignore
        while 1:
            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            for p, regexp in self.patterns:
                # Skip token types that are neither allowed by the
                # restriction nor ignorable
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and len(m.group(0)) > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = len(m.group(0))
            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = "Bad Token"
                if restrict:
                    # str.join replaces string.join(restrict, ", ")
                    msg = "Trying to find one of " + ", ".join(restrict)
                raise SyntaxError(self.pos, msg)
            # If we found something that isn't to be ignored, return it
            if best_pat not in self.ignore:
                # Create a token with this data
                token = (self.pos, self.pos+best_match, best_pat,
                         self.input[self.pos:self.pos+best_match])
                self.pos = self.pos + best_match
                # Only add this token if it's not in the list
                # (to prevent looping)
                if not self.tokens or token != self.tokens[-1]:
                    self.tokens.append(token)
                    self.restrictions.append(restrict)
                return
            else:
                # This token should be ignored: advance past it and rescan
                self.pos = self.pos + best_match
class Parser:
    """Base class for generated parsers: holds a Scanner and the index
    of the next token to consume."""

    def __init__(self, scanner):
        self._scanner = scanner
        self._pos = 0

    def _peek(self, *types):
        """Return the type of the lookahead token; any positional
        arguments restrict the set of token types allowed."""
        lookahead = self._scanner.token(self._pos, types)
        return lookahead[2]

    def _scan(self, type):
        """Consume one token of the given type and return its text."""
        tok = self._scanner.token(self._pos, [type])
        if tok[2] != type:
            raise SyntaxError(tok[0], 'Trying to find '+type)
        self._pos += 1
        return tok[3]
def print_error(input, err, scanner):
    """This is a really dumb long function to print error messages nicely.

    input   -- the full string that was being parsed
    err     -- a SyntaxError with .pos (char offset) and .msg
    scanner -- the Scanner; printed to show the nearby tokens

    count/rfind/find below come from 'from string import *' at the top
    of this module.
    """
    p = err.pos
    # Figure out the line number
    line = count(input[:p], '\n')
    print err.msg+" on line "+`line+1`+":"
    # Now try printing part of the line: a window of up to 80 chars on
    # either side of the error position
    text = input[max(p-80,0):p+80]
    # p becomes the error's offset within the window, not the input
    p = p - max(p-80,0)
    # Strip to the left: cut at the nearest newline (or CR) before p
    i = rfind(text[:p],'\n')
    j = rfind(text[:p],'\r')
    if i < 0 or (j < i and j >= 0): i = j
    if i >= 0 and i < p:
        p = p - i - 1
        text = text[i+1:]
    # Strip to the right: cut at the first newline (or CR) after p
    i = find(text,'\n',p)
    j = find(text,'\r',p)
    if i < 0 or (j < i and j >= 0): i = j
    if i >= 0:
        text = text[:i]
    # Now shorten the text
    while len(text) > 70 and p > 60:
        # Cut off 10 chars; "..." takes 3 of them back, so p moves by 7
        text = "..." + text[10:]
        p = p - 7
    # Now print the string, along with an indicator ('^' under char p)
    print '> ',text
    print '> ',' '*p + '^'
    print 'List of nearby tokens:', scanner
def wrap_error_reporter(parser, rule):
    """Call parser.<rule>() and pretty-print any parse failure.

    Returns whatever the rule returns on success; on SyntaxError or
    NoMoreTokens it prints a diagnostic and falls through, returning
    None.
    """
    try: return getattr(parser, rule)()
    except SyntaxError, s:
        input = parser._scanner.input
        try:
            print_error(input, s, parser._scanner)
        except ImportError:
            # NOTE(review): catching ImportError here looks odd -- it
            # only fires if print_error's machinery is unavailable;
            # falls back to a one-line report. Confirm intent.
            print 'Syntax Error',s.msg,'on line',1+count(input[:s.pos], '\n')
    except NoMoreTokens:
        print 'Could not complete parsing; stopped around here:'
        print parser._scanner