[Zope-Checkins] CVS: Zope/lib/python/third_party/docutils/docutils/parsers/rst - __init__.py:1.1.4.1 roles.py:1.1.4.1 states.py:1.1.4.1 tableparser.py:1.1.4.1
Andreas Jung
andreas at andreas-jung.com
Fri Oct 29 15:08:23 EDT 2004
Update of /cvs-repository/Zope/lib/python/third_party/docutils/docutils/parsers/rst
In directory cvs.zope.org:/tmp/cvs-serv23727/lib/python/third_party/docutils/docutils/parsers/rst
Added Files:
Tag: Zope-2_7-branch
__init__.py roles.py states.py tableparser.py
Log Message:
moved docutils to lib/python/third_party
=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/__init__.py ===
# Author: David Goodger
# Contact: goodger at users.sourceforge.net
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.
"""
This is the ``docutils.parsers.rst`` package. It exports a single class, `Parser`,
the reStructuredText parser.
Usage
=====
1. Create a parser::
parser = docutils.parsers.rst.Parser()
Several optional arguments may be passed to modify the parser's behavior.
Please see `Customizing the Parser`_ below for details.
2. Gather input (a multi-line string) by reading a file or the standard
input::
input = sys.stdin.read()
3. Create a new empty `docutils.nodes.document` tree::
document = docutils.utils.new_document(source, settings)
See `docutils.utils.new_document()` for parameter details.
4. Run the parser, populating the document tree::
parser.parse(input, document)
Parser Overview
===============
The reStructuredText parser is implemented as a state machine, examining its
input one line at a time. To understand how the parser works, please first
become familiar with the `docutils.statemachine` module, then see the
`states` module.
Customizing the Parser
----------------------
Anything that isn't already customizable is that way simply because that type
of customizability hasn't been implemented yet. Patches welcome!
When instantiating an object of the `Parser` class, two parameters may be
passed: ``rfc2822`` and ``inliner``. Pass ``rfc2822=1`` to enable an initial
RFC-2822 style header block, parsed as a "field_list" element (with "class"
attribute set to "rfc2822"). Currently this is the only body-level element
which is customizable without subclassing. (Tip: subclass `Parser` and change
its "state_classes" and "initial_state" attributes to refer to new classes.
Contact the author if you need more details.)
The ``inliner`` parameter takes an instance of `states.Inliner` or a subclass.
It handles inline markup recognition. A common extension is the addition of
further implicit hyperlinks, like "RFC 2822". This can be done by subclassing
`states.Inliner`, adding a new method for the implicit markup, and adding a
``(pattern, method)`` pair to the "implicit_dispatch" attribute of the
subclass. See `states.Inliner.implicit_inline()` for details. Explicit
inline markup can be customized in a `states.Inliner` subclass via the
``patterns.initial`` and ``dispatch`` attributes (and new methods as
appropriate).
"""
__docformat__ = 'reStructuredText'
import docutils.parsers
import docutils.statemachine
from docutils.parsers.rst import states
from docutils import frontend
class Parser(docutils.parsers.Parser):
"""The reStructuredText parser."""
supported = ('restructuredtext', 'rst', 'rest', 'restx', 'rtxt', 'rstx')
"""Aliases this parser supports."""
settings_spec = (
'reStructuredText Parser Options',
None,
(('Recognize and link to standalone PEP references (like "PEP 258").',
['--pep-references'],
{'action': 'store_true', 'validator': frontend.validate_boolean}),
('Base URL for PEP references '
'(default "http://www.python.org/peps/").',
['--pep-base-url'],
{'metavar': '<URL>', 'default': 'http://www.python.org/peps/',
'validator': frontend.validate_url_trailing_slash}),
('Recognize and link to standalone RFC references (like "RFC 822").',
['--rfc-references'],
{'action': 'store_true', 'validator': frontend.validate_boolean}),
('Base URL for RFC references (default "http://www.faqs.org/rfcs/").',
['--rfc-base-url'],
{'metavar': '<URL>', 'default': 'http://www.faqs.org/rfcs/',
'validator': frontend.validate_url_trailing_slash}),
('Set number of spaces for tab expansion (default 8).',
['--tab-width'],
{'metavar': '<width>', 'type': 'int', 'default': 8}),
('Remove spaces before footnote references.',
['--trim-footnote-reference-space'],
{'action': 'store_true', 'validator': frontend.validate_boolean}),))
config_section = 'restructuredtext parser'
config_section_dependencies = ('parsers',)
def __init__(self, rfc2822=None, inliner=None):
if rfc2822:
self.initial_state = 'RFC2822Body'
else:
self.initial_state = 'Body'
self.state_classes = states.state_classes
self.inliner = inliner
def parse(self, inputstring, document):
"""Parse `inputstring` and populate `document`, a document tree."""
self.setup_parse(inputstring, document)
debug = document.reporter[''].debug
self.statemachine = states.RSTStateMachine(
state_classes=self.state_classes,
initial_state=self.initial_state,
debug=debug)
inputlines = docutils.statemachine.string2lines(
inputstring, tab_width=document.settings.tab_width,
convert_whitespace=1)
self.statemachine.run(inputlines, document, inliner=self.inliner)
self.finish_parse()
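# A minimal sketch of the four "Usage" steps from the module docstring above,
# guarded so it only runs when this module is executed directly.  Obtaining
# settings via frontend.OptionParser is just one convenient way to build a
# suitable settings object, not the only one.
if __name__ == '__main__':
    import sys
    import docutils.utils
    parser = Parser()                               # 1. create a parser
    input = sys.stdin.read()                        # 2. gather input
    settings = frontend.OptionParser(
        components=(Parser,)).get_default_values()
    document = docutils.utils.new_document('<stdin>', settings)  # 3. new tree
    parser.parse(input, document)                   # 4. populate the tree
    print document.pformat()                        # show the resulting nodes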
=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/roles.py ===
# Author: Edward Loper
# Contact: edloper at gradient.cis.upenn.edu
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.
"""
This module defines standard interpreted text role functions, a registry for
interpreted text roles, and an API for adding to and retrieving from the
registry.
The interface for interpreted role functions is as follows::
def role_fn(name, rawtext, text, lineno, inliner,
options={}, content=[]):
code...
# Set function attributes for customization:
role_fn.options = ...
role_fn.content = ...
Parameters:
- ``name`` is the local name of the interpreted text role, the role name
actually used in the document.
- ``rawtext`` is a string containing the entire interpreted text construct.
Return it as a ``problematic`` node linked to a system message if there is a
problem.
- ``text`` is the interpreted text content.
- ``lineno`` is the line number where the interpreted text begins.
- ``inliner`` is the Inliner object that called the role function.
It defines the following useful attributes: ``reporter``,
``problematic``, ``memo``, ``parent``, ``document``.
- ``options``: A dictionary of directive options for customization, to be
interpreted by the role function. Used for additional attributes for the
generated elements and other functionality.
- ``content``: A list of strings, the directive content for customization
("role" directive). To be interpreted by the role function.
Function attributes for customization, interpreted by the "role" directive:
- ``options``: A dictionary, mapping known option names to conversion
functions such as `int` or `float`. ``None`` or an empty dict implies no
options to parse. Several directive option conversion functions are defined
in the `directives` module.
All role functions implicitly support the "class" option, unless disabled
with an explicit ``{'class': None}``.
- ``content``: A boolean; true if content is allowed. Client code must handle
the case where content is required but not supplied (an empty content list
will be supplied).
Note that unlike directives, the "arguments" function attribute is not
supported for role customization. Directive arguments are handled by the
"role" directive itself.
Interpreted role functions return a tuple of two values:
- A list of nodes which will be inserted into the document tree at the
point where the interpreted role was encountered (can be an empty
list).
- A list of system messages, which will be inserted into the document tree
immediately after the end of the current inline block (can also be empty).
"""
__docformat__ = 'reStructuredText'
from docutils import nodes
from docutils.parsers.rst import directives
from docutils.parsers.rst.languages import en as _fallback_language_module
DEFAULT_INTERPRETED_ROLE = 'title-reference'
"""
The canonical name of the default interpreted role. This role is used
when no role is specified for a piece of interpreted text.
"""
_role_registry = {}
"""Mapping of canonical role names to role functions. Language-dependent role
names are defined in the ``languages`` subpackage."""
_roles = {}
"""Mapping of local or language-dependent interpreted text role names to role
functions."""
def role(role_name, language_module, lineno, reporter):
"""
Locate and return a role function from its language-dependent name, along
with a list of system messages. If the role is not found in the current
language, check English. Return a 2-tuple: role function (``None`` if the
named role cannot be found) and a list of system messages.
"""
normname = role_name.lower()
messages = []
msg_text = []
if _roles.has_key(normname):
return _roles[normname], messages
if role_name:
canonicalname = None
try:
canonicalname = language_module.roles[normname]
except AttributeError, error:
msg_text.append('Problem retrieving role entry from language '
'module %r: %s.' % (language_module, error))
except KeyError:
msg_text.append('No role entry for "%s" in module "%s".'
% (role_name, language_module.__name__))
else:
canonicalname = DEFAULT_INTERPRETED_ROLE
# If we didn't find it, try English as a fallback.
if not canonicalname:
try:
canonicalname = _fallback_language_module.roles[normname]
msg_text.append('Using English fallback for role "%s".'
% role_name)
except KeyError:
msg_text.append('Trying "%s" as canonical role name.'
% role_name)
# The canonical name should be an English name, but just in case:
canonicalname = normname
# Collect any messages that we generated.
if msg_text:
message = reporter.info('\n'.join(msg_text), line=lineno)
messages.append(message)
# Look the role up in the registry, and return it.
if _role_registry.has_key(canonicalname):
role_fn = _role_registry[canonicalname]
register_local_role(normname, role_fn)
return role_fn, messages
else:
return None, messages # Error message will be generated by caller.
def register_canonical_role(name, role_fn):
"""
Register an interpreted text role by its canonical name.
:Parameters:
- `name`: The canonical name of the interpreted role.
- `role_fn`: The role function. See the module docstring.
"""
set_implicit_options(role_fn)
_role_registry[name] = role_fn
def register_local_role(name, role_fn):
"""
Register an interpreted text role by its local or language-dependent name.
:Parameters:
- `name`: The local or language-dependent name of the interpreted role.
- `role_fn`: The role function. See the module docstring.
"""
set_implicit_options(role_fn)
_roles[name] = role_fn
def set_implicit_options(role_fn):
"""
Add customization options to role functions, unless explicitly set or
disabled.
"""
if not hasattr(role_fn, 'options') or role_fn.options is None:
role_fn.options = {'class': directives.class_option}
elif not role_fn.options.has_key('class'):
role_fn.options['class'] = directives.class_option
def register_generic_role(canonical_name, node_class):
"""For roles which simply wrap a given `node_class` around the text."""
role = GenericRole(canonical_name, node_class)
register_canonical_role(canonical_name, role)
class GenericRole:
"""
Generic interpreted text role, where the interpreted text is simply
wrapped with the provided node class.
"""
def __init__(self, role_name, node_class):
self.name = role_name
self.node_class = node_class
def __call__(self, role, rawtext, text, lineno, inliner,
options={}, content=[]):
return [self.node_class(rawtext, text, **options)], []
class CustomRole:
"""
Wrapper for custom interpreted text roles.
"""
def __init__(self, role_name, base_role, options={}, content=[]):
self.name = role_name
self.base_role = base_role
self.options = None
if hasattr(base_role, 'options'):
self.options = base_role.options
self.content = None
if hasattr(base_role, 'content'):
self.content = base_role.content
self.supplied_options = options
self.supplied_content = content
def __call__(self, role, rawtext, text, lineno, inliner,
options={}, content=[]):
opts = self.supplied_options.copy()
opts.update(options)
cont = list(self.supplied_content)
if cont and content:
cont += '\n'
cont.extend(content)
return self.base_role(role, rawtext, text, lineno, inliner,
options=opts, content=cont)
def generic_custom_role(role, rawtext, text, lineno, inliner,
options={}, content=[]):
""""""
# Once nested inline markup is implemented, this and other methods should
# recursively call inliner.nested_parse().
return [nodes.inline(rawtext, text, **options)], []
generic_custom_role.options = {'class': directives.class_option}
######################################################################
# Define and register the standard roles:
######################################################################
register_generic_role('abbreviation', nodes.abbreviation)
register_generic_role('acronym', nodes.acronym)
register_generic_role('emphasis', nodes.emphasis)
register_generic_role('literal', nodes.literal)
register_generic_role('strong', nodes.strong)
register_generic_role('subscript', nodes.subscript)
register_generic_role('superscript', nodes.superscript)
register_generic_role('title-reference', nodes.title_reference)
def pep_reference_role(role, rawtext, text, lineno, inliner,
options={}, content=[]):
try:
pepnum = int(text)
if pepnum < 0 or pepnum > 9999:
raise ValueError
except ValueError:
msg = inliner.reporter.error(
'PEP number must be a number from 0 to 9999; "%s" is invalid.'
% text, line=lineno)
prb = inliner.problematic(rawtext, rawtext, msg)
return [prb], [msg]
# Base URL mainly used by inliner.pep_reference; so this is correct:
ref = inliner.document.settings.pep_base_url + inliner.pep_url % pepnum
return [nodes.reference(rawtext, 'PEP ' + text, refuri=ref, **options)], []
register_canonical_role('pep-reference', pep_reference_role)
def rfc_reference_role(role, rawtext, text, lineno, inliner,
options={}, content=[]):
try:
rfcnum = int(text)
if rfcnum <= 0:
raise ValueError
except ValueError:
msg = inliner.reporter.error(
'RFC number must be a number greater than or equal to 1; '
'"%s" is invalid.' % text, line=lineno)
prb = inliner.problematic(rawtext, rawtext, msg)
return [prb], [msg]
# Base URL mainly used by inliner.rfc_reference, so this is correct:
ref = inliner.document.settings.rfc_base_url + inliner.rfc_url % rfcnum
node = nodes.reference(rawtext, 'RFC ' + text, refuri=ref, **options)
return [node], []
register_canonical_role('rfc-reference', rfc_reference_role)
######################################################################
# Register roles that are currently unimplemented.
######################################################################
def unimplemented_role(role, rawtext, text, lineno, inliner, attributes={}):
msg = inliner.reporter.error(
'Interpreted text role "%s" not implemented.' % role, line=lineno)
prb = inliner.problematic(rawtext, rawtext, msg)
return [prb], [msg]
register_canonical_role('index', unimplemented_role)
register_canonical_role('named-reference', unimplemented_role)
register_canonical_role('anonymous-reference', unimplemented_role)
register_canonical_role('uri-reference', unimplemented_role)
register_canonical_role('footnote-reference', unimplemented_role)
register_canonical_role('citation-reference', unimplemented_role)
register_canonical_role('substitution-reference', unimplemented_role)
register_canonical_role('target', unimplemented_role)
# This should remain unimplemented, for testing purposes:
register_canonical_role('restructuredtext-unimplemented-role',
unimplemented_role)
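# A hedged sketch of the role-function interface documented in the module
# docstring above.  The "fixme" role and fixme_role function are hypothetical
# names used only for illustration; they are not part of docutils.
def fixme_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    # Return the two documented values: a list of new nodes and a list of
    # system messages (none here).
    node = nodes.literal(rawtext, 'FIXME: ' + text, **options)
    return [node], []
fixme_role.options = {'class': directives.class_option}
fixme_role.content = None
# If such a role were wanted, it would be registered like the standard roles
# above, e.g.:
#     register_canonical_role('fixme', fixme_role)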
=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/states.py === (2484/2884 lines abridged)
# Author: David Goodger
# Contact: goodger at users.sourceforge.net
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.
"""
This is the ``docutils.parsers.rst.states`` module, the core of
the reStructuredText parser. It defines the following:
:Classes:
- `RSTStateMachine`: reStructuredText parser's entry point.
- `NestedStateMachine`: recursive StateMachine.
- `RSTState`: reStructuredText State superclass.
- `Inliner`: For parsing inline markup.
- `Body`: Generic classifier of the first line of a block.
- `SpecializedBody`: Superclass for compound element members.
- `BulletList`: Second and subsequent bullet_list list_items.
- `DefinitionList`: Second+ definition_list_items.
- `EnumeratedList`: Second+ enumerated_list list_items.
- `FieldList`: Second+ fields.
- `OptionList`: Second+ option_list_items.
- `RFC2822List`: Second+ RFC2822-style fields.
- `ExtensionOptions`: Parses directive option fields.
- `Explicit`: Second+ explicit markup constructs.
- `SubstitutionDef`: For embedded directives in substitution definitions.
- `Text`: Classifier of second line of a text block.
- `SpecializedText`: Superclass for continuation lines of Text-variants.
- `Definition`: Second line of potential definition_list_item.
- `Line`: Second line of overlined section title or transition marker.
- `Struct`: An auxiliary collection class.
:Exception classes:
- `MarkupError`
- `ParserError`
- `MarkupMismatch`
:Functions:
- `escape2null()`: Return a string, escape-backslashes converted to nulls.
- `unescape()`: Return a string, nulls removed or restored to backslashes.
:Attributes:
- `state_classes`: set of State classes used with `RSTStateMachine`.
Parser Overview
===============
The reStructuredText parser is implemented as a recursive state machine,
examining its input one line at a time. To understand how the parser works,
please first become familiar with the `docutils.statemachine` module. In the
description below, references are made to classes defined in this module;
please see the individual classes for details.
Parsing proceeds as follows:
1. The state machine examines each line of input, checking each of the
transition patterns of the state `Body`, in order, looking for a match.
The implicit transitions (blank lines and indentation) are checked before
any others. The 'text' transition is a catch-all (matches anything).
2. The method associated with the matched transition pattern is called.
A. Some transition methods are self-contained, appending elements to the
document tree (`Body.doctest` parses a doctest block). The parser's
current line index is advanced to the end of the element, and parsing
continues with step 1.
B. Other transition methods trigger the creation of a nested state machine,
whose job is to parse a compound construct ('indent' does a block quote,
'bullet' does a bullet list, 'overline' does a section [first checking
for a valid section header], etc.).
- In the case of lists and explicit markup, a one-off state machine is
created and run to parse contents of the first item.
- A new state machine is created and its initial state is set to the
appropriate specialized state (`BulletList` in the case of the
'bullet' transition; see `SpecializedBody` for more detail). This
state machine is run to parse the compound element (or series of
explicit markup elements), and returns as soon as a non-member element
is encountered. For example, the `BulletList` state machine ends as
soon as it encounters an element which is not a list item of that
bullet list. The optional omission of inter-element blank lines is
enabled by this nested state machine.
- The current line index is advanced to the end of the elements parsed,
and parsing continues with step 1.
C. The result of the 'text' transition depends on the next line of text.
The current state is changed to `Text`, under which the second line is
examined. If the second line is:
- Indented: The element is a definition list item, and parsing proceeds
similarly to step 2.B, using the `DefinitionList` state.
- A line of uniform punctuation characters: The element is a section
header; again, parsing proceeds as in step 2.B, and `Body` is still
used.
- Anything else: The element is a paragraph, which is examined for
inline markup and appended to the parent element. Processing
continues with step 1.
"""
__docformat__ = 'reStructuredText'
import sys
import re
import roman
from types import TupleType
from docutils import nodes, statemachine, utils, urischemes
from docutils import ApplicationError, DataError
from docutils.statemachine import StateMachineWS, StateWS
from docutils.nodes import fully_normalize_name as normalize_name
from docutils.nodes import whitespace_normalize_name
from docutils.parsers.rst import directives, languages, tableparser, roles
from docutils.parsers.rst.languages import en as _fallback_language_module
class MarkupError(DataError): pass
class UnknownInterpretedRoleError(DataError): pass
class InterpretedRoleNotImplementedError(DataError): pass
class ParserError(ApplicationError): pass
class MarkupMismatch(Exception): pass
class Struct:
"""Stores data attributes for dotted-attribute access."""
def __init__(self, **keywordargs):
self.__dict__.update(keywordargs)
class RSTStateMachine(StateMachineWS):
"""
reStructuredText's master StateMachine.
The entry point to reStructuredText parsing is the `run()` method.
"""
def run(self, input_lines, document, input_offset=0, match_titles=1,
inliner=None):
"""
Parse `input_lines` and return a `docutils.nodes.document` instance.
Extend `StateMachineWS.run()`: set up parse-global data, run the
StateMachine, and return the resulting document.
"""
self.language = languages.get_language(
document.settings.language_code)
self.match_titles = match_titles
if inliner is None:
inliner = Inliner()
inliner.init_customizations(document.settings)
self.memo = Struct(document=document,
reporter=document.reporter,
language=self.language,
title_styles=[],
section_level=0,
section_bubble_up_kludge=0,
inliner=inliner)
self.document = document
self.attach_observer(document.note_source)
self.reporter = self.memo.reporter
self.node = document
results = StateMachineWS.run(self, input_lines, input_offset,
input_source=document['source'])
assert results == [], 'RSTStateMachine.run() results should be empty!'
self.check_document()
self.node = self.memo = None # remove unneeded references
def check_document(self):
"""Check for illegal structure: empty document."""
if len(self.document) == 0:
error = self.reporter.error(
'Document empty; must have contents.', line=0)
self.document += error
class NestedStateMachine(StateMachineWS):
"""
StateMachine run from within other StateMachine runs, to parse nested
document structures.
"""
def run(self, input_lines, input_offset, memo, node, match_titles=1):
"""
Parse `input_lines` and populate a `docutils.nodes.document` instance.
Extend `StateMachineWS.run()`: set up document-wide data.
"""
self.match_titles = match_titles
self.memo = memo
self.document = memo.document
self.attach_observer(self.document.note_source)
[-=- -=- -=- 2484 lines omitted -=- -=- -=-]
if len(self.parent) == 0:
msg = self.reporter.error(
'Document or section may not begin with a transition.',
line=lineno)
self.parent += msg
elif isinstance(self.parent[-1], nodes.transition):
msg = self.reporter.error(
'At least one body element must separate transitions; '
'adjacent transitions not allowed.',
line=lineno)
self.parent += msg
self.parent += transition
return [], 'Body', []
def text(self, match, context, next_state):
"""Potential over- & underlined title."""
lineno = self.state_machine.abs_line_number() - 1
overline = context[0]
title = match.string
underline = ''
try:
underline = self.state_machine.next_line()
except EOFError:
blocktext = overline + '\n' + title
if len(overline.rstrip()) < 4:
self.short_overline(context, blocktext, lineno, 2)
else:
msg = self.reporter.severe(
'Incomplete section title.',
nodes.literal_block(blocktext, blocktext), line=lineno)
self.parent += msg
return [], 'Body', []
source = '%s\n%s\n%s' % (overline, title, underline)
overline = overline.rstrip()
underline = underline.rstrip()
if not self.transitions['underline'][0].match(underline):
blocktext = overline + '\n' + title + '\n' + underline
if len(overline.rstrip()) < 4:
self.short_overline(context, blocktext, lineno, 2)
else:
msg = self.reporter.severe(
'Missing matching underline for section title overline.',
nodes.literal_block(source, source), line=lineno)
self.parent += msg
return [], 'Body', []
elif overline != underline:
blocktext = overline + '\n' + title + '\n' + underline
if len(overline.rstrip()) < 4:
self.short_overline(context, blocktext, lineno, 2)
else:
msg = self.reporter.severe(
'Title overline & underline mismatch.',
nodes.literal_block(source, source), line=lineno)
self.parent += msg
return [], 'Body', []
title = title.rstrip()
messages = []
if len(title) > len(overline):
blocktext = overline + '\n' + title + '\n' + underline
if len(overline.rstrip()) < 4:
self.short_overline(context, blocktext, lineno, 2)
else:
msg = self.reporter.warning(
'Title overline too short.',
nodes.literal_block(source, source), line=lineno)
messages.append(msg)
style = (overline[0], underline[0])
self.eofcheck = 0 # @@@ not sure this is correct
self.section(title.lstrip(), source, style, lineno + 1, messages)
self.eofcheck = 1
return [], 'Body', []
indent = text # indented title
def underline(self, match, context, next_state):
overline = context[0]
blocktext = overline + '\n' + self.state_machine.line
lineno = self.state_machine.abs_line_number() - 1
if len(overline.rstrip()) < 4:
self.short_overline(context, blocktext, lineno, 1)
msg = self.reporter.error(
'Invalid section title or transition marker.',
nodes.literal_block(blocktext, blocktext), line=lineno)
self.parent += msg
return [], 'Body', []
def short_overline(self, context, blocktext, lineno, lines=1):
msg = self.reporter.info(
'Possible incomplete section title.\nTreating the overline as '
"ordinary text because it's so short.", line=lineno)
self.parent += msg
self.state_correction(context, lines)
def state_correction(self, context, lines=1):
self.state_machine.previous_line(lines)
context[:] = []
raise statemachine.StateCorrection('Body', 'text')
class QuotedLiteralBlock(RSTState):
"""
Nested parse handler for quoted (unindented) literal blocks.
Special-purpose. Not for inclusion in `state_classes`.
"""
patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
'text': r''}
initial_transitions = ('initial_quoted', 'text')
def __init__(self, state_machine, debug=0):
RSTState.__init__(self, state_machine, debug)
self.messages = []
self.initial_lineno = None
def blank(self, match, context, next_state):
if context:
raise EOFError
else:
return context, next_state, []
def eof(self, context):
if context:
text = '\n'.join(context)
literal_block = nodes.literal_block(text, text)
literal_block.line = self.initial_lineno
self.parent += literal_block
else:
self.parent += self.reporter.warning(
'Literal block expected; none found.',
line=self.state_machine.abs_line_number())
self.state_machine.previous_line()
self.parent += self.messages
return []
def indent(self, match, context, next_state):
assert context, ('QuotedLiteralBlock.indent: context should not '
'be empty!')
self.messages.append(
self.reporter.error('Unexpected indentation.',
line=self.state_machine.abs_line_number()))
self.state_machine.previous_line()
raise EOFError
def initial_quoted(self, match, context, next_state):
"""Match arbitrary quote character on the first line only."""
self.remove_transition('initial_quoted')
quote = match.string[0]
pattern = re.compile(re.escape(quote))
# New transition matches consistent quotes only:
self.add_transition('quoted',
(pattern, self.quoted, self.__class__.__name__))
self.initial_lineno = self.state_machine.abs_line_number()
return [match.string], next_state, []
def quoted(self, match, context, next_state):
"""Match consistent quotes on subsequent lines."""
context.append(match.string)
return context, next_state, []
def text(self, match, context, next_state):
if context:
self.messages.append(
self.reporter.error('Inconsistent literal block quoting.',
line=self.state_machine.abs_line_number()))
self.state_machine.previous_line()
raise EOFError
state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
OptionList, ExtensionOptions, Explicit, Text, Definition,
Line, SubstitutionDef, RFC2822Body, RFC2822List)
"""Standard set of State classes used to start `RSTStateMachine`."""
def escape2null(text):
"""Return a string with escape-backslashes converted to nulls."""
parts = []
start = 0
while 1:
found = text.find('\\', start)
if found == -1:
parts.append(text[start:])
return ''.join(parts)
parts.append(text[start:found])
parts.append('\x00' + text[found+1:found+2])
start = found + 2 # skip character after escape
def unescape(text, restore_backslashes=0):
"""
Return a string with nulls removed or restored to backslashes.
Backslash-escaped spaces are also removed.
"""
if restore_backslashes:
return text.replace('\x00', '\\')
else:
for sep in ['\x00 ', '\x00\n', '\x00']:
text = ''.join(text.split(sep))
return text
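# A small illustrative check of the escape2null()/unescape() helpers above,
# guarded so it only runs when this module is executed directly.
if __name__ == '__main__':
    text = r'a \* literal asterisk and an escaped\ space'
    nulled = escape2null(text)     # backslash escapes become null bytes
    assert unescape(nulled) == 'a * literal asterisk and an escapedspace'
    assert unescape(nulled, restore_backslashes=1) == text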
=== Added File Zope/lib/python/third_party/docutils/docutils/parsers/rst/tableparser.py ===
# Author: David Goodger
# Contact: goodger at users.sourceforge.net
# Revision: $Revision: 1.1.4.1 $
# Date: $Date: 2004/10/29 19:08:22 $
# Copyright: This module has been placed in the public domain.
"""
This module defines table parser classes, which parse plaintext-graphic tables
and produce a well-formed data structure suitable for building a CALS table.
:Classes:
- `GridTableParser`: Parse fully-formed tables represented with a grid.
- `SimpleTableParser`: Parse simple tables, delimited by top & bottom
borders.
:Exception class: `TableMarkupError`
:Function:
`update_dict_of_lists()`: Merge two dictionaries containing list values.
"""
__docformat__ = 'reStructuredText'
import re
import sys
from docutils import DataError
class TableMarkupError(DataError): pass
class TableParser:
"""
Abstract superclass for the common parts of the syntax-specific parsers.
"""
head_body_separator_pat = None
"""Matches the row separator between head rows and body rows."""
def parse(self, block):
"""
Analyze the text `block` and return a table data structure.
Given a plaintext-graphic table in `block` (list of lines of text; no
whitespace padding), parse the table, construct and return the data
necessary to construct a CALS table or equivalent.
Raise `TableMarkupError` if there is any problem with the markup.
"""
self.setup(block)
self.find_head_body_sep()
self.parse_table()
structure = self.structure_from_cells()
return structure
def find_head_body_sep(self):
"""Look for a head/body row separator line; store the line index."""
for i in range(len(self.block)):
line = self.block[i]
if self.head_body_separator_pat.match(line):
if self.head_body_sep:
raise TableMarkupError(
'Multiple head/body row separators in table (at line '
'offset %s and %s); only one allowed.'
% (self.head_body_sep, i))
else:
self.head_body_sep = i
self.block[i] = line.replace('=', '-')
if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
- 1):
raise TableMarkupError('The head/body row separator may not be '
'the first or last line of the table.')
class GridTableParser(TableParser):
"""
Parse a grid table using `parse()`.
Here's an example of a grid table::
+------------------------+------------+----------+----------+
| Header row, column 1   | Header 2   | Header 3 | Header 4 |
+========================+============+==========+==========+
| body row 1, column 1   | column 2   | column 3 | column 4 |
+------------------------+------------+----------+----------+
| body row 2             | Cells may span columns.          |
+------------------------+------------+---------------------+
| body row 3             | Cells may  | - Table cells       |
+------------------------+ span rows. | - contain           |
| body row 4             |            | - body elements.    |
+------------------------+------------+---------------------+
Intersections use '+', row separators use '-' (except for one optional
head/body row separator, which uses '='), and column separators use '|'.
Passing the above table to the `parse()` method will result in the
following data structure::
([24, 12, 10, 10],
[[(0, 0, 1, ['Header row, column 1']),
(0, 0, 1, ['Header 2']),
(0, 0, 1, ['Header 3']),
(0, 0, 1, ['Header 4'])]],
[[(0, 0, 3, ['body row 1, column 1']),
(0, 0, 3, ['column 2']),
(0, 0, 3, ['column 3']),
(0, 0, 3, ['column 4'])],
[(0, 0, 5, ['body row 2']),
(0, 2, 5, ['Cells may span columns.']),
None,
None],
[(0, 0, 7, ['body row 3']),
(1, 0, 7, ['Cells may', 'span rows.', '']),
(1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
None],
[(0, 0, 9, ['body row 4']), None, None, None]])
The first item is a list containing column widths (colspecs). The second
item is a list of head rows, and the third is a list of body rows. Each
row contains a list of cells. Each cell is either None (for a cell unused
because of another cell's span), or a tuple. A cell tuple contains four
items: the number of extra rows used by the cell in a vertical span
(morerows); the number of extra columns used by the cell in a horizontal
span (morecols); the line offset of the first line of the cell contents;
and the cell contents, a list of lines of text.
"""
head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
def setup(self, block):
self.block = block[:] # make a copy; it may be modified
self.block.disconnect() # don't propagate changes to parent
self.bottom = len(block) - 1
self.right = len(block[0]) - 1
self.head_body_sep = None
self.done = [-1] * len(block[0])
self.cells = []
self.rowseps = {0: [0]}
self.colseps = {0: [0]}
def parse_table(self):
"""
Start with a queue of upper-left corners, containing the upper-left
corner of the table itself. Trace out one rectangular cell, remember
it, and add its upper-right and lower-left corners to the queue of
potential upper-left corners of further cells. Process the queue in
top-to-bottom order, keeping track of how much of each text column has
been seen.
We'll end up knowing all the row and column boundaries, cell positions
and their dimensions.
"""
corners = [(0, 0)]
while corners:
top, left = corners.pop(0)
if top == self.bottom or left == self.right \
or top <= self.done[left]:
continue
result = self.scan_cell(top, left)
if not result:
continue
bottom, right, rowseps, colseps = result
update_dict_of_lists(self.rowseps, rowseps)
update_dict_of_lists(self.colseps, colseps)
self.mark_done(top, left, bottom, right)
cellblock = self.block.get_2D_block(top + 1, left + 1,
bottom, right)
cellblock.disconnect() # lines in cell can't sync with parent
self.cells.append((top, left, bottom, right, cellblock))
corners.extend([(top, right), (bottom, left)])
corners.sort()
if not self.check_parse_complete():
raise TableMarkupError('Malformed table; parse incomplete.')
def mark_done(self, top, left, bottom, right):
"""For keeping track of how much of each text column has been seen."""
before = top - 1
after = bottom - 1
for col in range(left, right):
assert self.done[col] == before
self.done[col] = after
def check_parse_complete(self):
"""Each text column should have been completely seen."""
last = self.bottom - 1
for col in range(self.right):
if self.done[col] != last:
return None
return 1
def scan_cell(self, top, left):
"""Starting at the top-left corner, start tracing out a cell."""
assert self.block[top][left] == '+'
result = self.scan_right(top, left)
return result
def scan_right(self, top, left):
"""
Look for the top-right corner of the cell, and make note of all column
boundaries ('+').
"""
colseps = {}
line = self.block[top]
for i in range(left + 1, self.right + 1):
if line[i] == '+':
colseps[i] = [top]
result = self.scan_down(top, left, i)
if result:
bottom, rowseps, newcolseps = result
update_dict_of_lists(colseps, newcolseps)
return bottom, i, rowseps, colseps
elif line[i] != '-':
return None
return None
def scan_down(self, top, left, right):
"""
Look for the bottom-right corner of the cell, making note of all row
boundaries.
"""
rowseps = {}
for i in range(top + 1, self.bottom + 1):
if self.block[i][right] == '+':
rowseps[i] = [right]
result = self.scan_left(top, left, i, right)
if result:
newrowseps, colseps = result
update_dict_of_lists(rowseps, newrowseps)
return i, rowseps, colseps
elif self.block[i][right] != '|':
return None
return None
def scan_left(self, top, left, bottom, right):
"""
Noting column boundaries, look for the bottom-left corner of the cell.
It must line up with the starting point.
"""
colseps = {}
line = self.block[bottom]
for i in range(right - 1, left, -1):
if line[i] == '+':
colseps[i] = [bottom]
elif line[i] != '-':
return None
if line[left] != '+':
return None
result = self.scan_up(top, left, bottom, right)
if result is not None:
rowseps = result
return rowseps, colseps
return None
def scan_up(self, top, left, bottom, right):
"""
Noting row boundaries, see if we can return to the starting point.
"""
rowseps = {}
for i in range(bottom - 1, top, -1):
if self.block[i][left] == '+':
rowseps[i] = [left]
elif self.block[i][left] != '|':
return None
return rowseps
def structure_from_cells(self):
"""
From the data collected by `scan_cell()`, convert to the final data
structure.
"""
rowseps = self.rowseps.keys() # list of row boundaries
rowseps.sort()
rowindex = {}
for i in range(len(rowseps)):
rowindex[rowseps[i]] = i # row boundary -> row number mapping
colseps = self.colseps.keys() # list of column boundaries
colseps.sort()
colindex = {}
for i in range(len(colseps)):
colindex[colseps[i]] = i # column boundary -> col number map
colspecs = [(colseps[i] - colseps[i - 1] - 1)
for i in range(1, len(colseps))] # list of column widths
# prepare an empty table with the correct number of rows & columns
onerow = [None for i in range(len(colseps) - 1)]
rows = [onerow[:] for i in range(len(rowseps) - 1)]
# keep track of # of cells remaining; should reduce to zero
remaining = (len(rowseps) - 1) * (len(colseps) - 1)
for top, left, bottom, right, block in self.cells:
rownum = rowindex[top]
colnum = colindex[left]
assert rows[rownum][colnum] is None, (
'Cell (row %s, column %s) already used.'
% (rownum + 1, colnum + 1))
morerows = rowindex[bottom] - rownum - 1
morecols = colindex[right] - colnum - 1
remaining -= (morerows + 1) * (morecols + 1)
# write the cell into the table
rows[rownum][colnum] = (morerows, morecols, top + 1, block)
assert remaining == 0, 'Unused cells remaining.'
if self.head_body_sep: # separate head rows from body rows
numheadrows = rowindex[self.head_body_sep]
headrows = rows[:numheadrows]
bodyrows = rows[numheadrows:]
else:
headrows = []
bodyrows = rows
return (colspecs, headrows, bodyrows)
class SimpleTableParser(TableParser):
"""
Parse a simple table using `parse()`.
Here's an example of a simple table::
=====  =====
col 1  col 2
=====  =====
1      Second column of row 1.
2      Second column of row 2.
       Second line of paragraph.
3      - Second column of row 3.

       - Second item in bullet
         list (row 3, column 2).
4 is a span
------------
5
=====  =====
Top and bottom borders use '=', column span underlines use '-', column
separation is indicated with spaces.
Passing the above table to the `parse()` method will result in the
following data structure, whose interpretation is the same as for
`GridTableParser`::
([5, 25],
[[(0, 0, 1, ['col 1']),
(0, 0, 1, ['col 2'])]],
[[(0, 0, 3, ['1']),
(0, 0, 3, ['Second column of row 1.'])],
[(0, 0, 4, ['2']),
(0, 0, 4, ['Second column of row 2.',
'Second line of paragraph.'])],
[(0, 0, 6, ['3']),
(0, 0, 6, ['- Second column of row 3.',
'',
'- Second item in bullet',
' list (row 3, column 2).'])],
[(0, 1, 10, ['4 is a span'])],
[(0, 0, 12, ['5']),
(0, 0, 12, [''])]])
"""
head_body_separator_pat = re.compile('=[ =]*$')
span_pat = re.compile('-[ -]*$')
def setup(self, block):
self.block = block[:] # make a copy; it will be modified
self.block.disconnect() # don't propagate changes to parent
# Convert top & bottom borders to column span underlines:
self.block[0] = self.block[0].replace('=', '-')
self.block[-1] = self.block[-1].replace('=', '-')
self.head_body_sep = None
self.columns = []
self.border_end = None
self.table = []
self.done = [-1] * len(block[0])
self.rowseps = {0: [0]}
self.colseps = {0: [0]}
def parse_table(self):
"""
First determine the column boundaries from the top border, then
process rows. Each row may consist of multiple lines; accumulate
lines until a row is complete. Call `self.parse_row` to finish the
job.
"""
# Top border must fully describe all table columns.
self.columns = self.parse_columns(self.block[0], 0)
self.border_end = self.columns[-1][1]
firststart, firstend = self.columns[0]
offset = 1 # skip top border
start = 1
text_found = None
while offset < len(self.block):
line = self.block[offset]
if self.span_pat.match(line):
# Column span underline or border; row is complete.
self.parse_row(self.block[start:offset], start,
(line.rstrip(), offset))
start = offset + 1
text_found = None
elif line[firststart:firstend].strip():
# First column not blank, therefore it's a new row.
if text_found and offset != start:
self.parse_row(self.block[start:offset], start)
start = offset
text_found = 1
elif not text_found:
start = offset + 1
offset += 1
def parse_columns(self, line, offset):
"""
Given a column span underline, return a list of (begin, end) pairs.
"""
cols = []
end = 0
while 1:
begin = line.find('-', end)
end = line.find(' ', begin)
if begin < 0:
break
if end < 0:
end = len(line)
cols.append((begin, end))
if self.columns:
if cols[-1][1] != self.border_end:
raise TableMarkupError('Column span incomplete at line '
'offset %s.' % offset)
# Allow for an unbounded rightmost column:
cols[-1] = (cols[-1][0], self.columns[-1][1])
return cols
def init_row(self, colspec, offset):
i = 0
cells = []
for start, end in colspec:
morecols = 0
try:
assert start == self.columns[i][0]
while end != self.columns[i][1]:
i += 1
morecols += 1
except (AssertionError, IndexError):
raise TableMarkupError('Column span alignment problem at '
'line offset %s.' % (offset + 1))
cells.append([0, morecols, offset, []])
i += 1
return cells
def parse_row(self, lines, start, spanline=None):
"""
Given the text `lines` of a row, parse it and append to `self.table`.
The row is parsed according to the current column spec (either
`spanline` if provided or `self.columns`). For each column, extract
text from each line, and check for text in column margins. Finally,
adjust for insignificant whitespace.
"""
if not (lines or spanline):
# No new row, just blank lines.
return
if spanline:
columns = self.parse_columns(*spanline)
span_offset = spanline[1]
else:
columns = self.columns[:]
span_offset = start
self.check_columns(lines, start, columns)
row = self.init_row(columns, start)
for i in range(len(columns)):
start, end = columns[i]
cellblock = lines.get_2D_block(0, start, len(lines), end)
cellblock.disconnect() # lines in cell can't sync with parent
row[i][3] = cellblock
self.table.append(row)
def check_columns(self, lines, first_line, columns):
"""
Check for text in column margins and text overflow in the last column.
Raise TableMarkupError if anything but whitespace is in column margins.
Adjust the end value for the last column if there is text overflow.
"""
# "Infinite" value for a dummy last column's beginning, used to
# check for text overflow:
columns.append((sys.maxint, None))
lastcol = len(columns) - 2
for i in range(len(columns) - 1):
start, end = columns[i]
nextstart = columns[i+1][0]
offset = 0
for line in lines:
if i == lastcol and line[end:].strip():
text = line[start:].rstrip()
new_end = start + len(text)
columns[i] = (start, new_end)
main_start, main_end = self.columns[-1]
if new_end > main_end:
self.columns[-1] = (main_start, new_end)
elif line[end:nextstart].strip():
raise TableMarkupError('Text in column margin at line '
'offset %s.' % (first_line + offset))
offset += 1
columns.pop()
def structure_from_cells(self):
colspecs = [end - start for start, end in self.columns]
first_body_row = 0
if self.head_body_sep:
for i in range(len(self.table)):
if self.table[i][0][2] > self.head_body_sep:
first_body_row = i
break
return (colspecs, self.table[:first_body_row],
self.table[first_body_row:])
def update_dict_of_lists(master, newdata):
"""
Extend the list values of `master` with those from `newdata`.
Both parameters must be dictionaries containing list values.
"""
for key, values in newdata.items():
master.setdefault(key, []).extend(values)
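# A hedged usage sketch for GridTableParser, guarded so it only runs when this
# module is executed directly.  parse() expects a StringList-style block
# (setup() calls disconnect(), parse_table() calls get_2D_block()), so the
# sketch wraps plain strings in docutils.statemachine.StringList.
if __name__ == '__main__':
    from docutils.statemachine import StringList
    lines = ['+-----+-------+',
             '| one | two   |',
             '+=====+=======+',
             '| 1   | 2     |',
             '+-----+-------+']
    block = StringList(lines, source='<example>')
    colspecs, headrows, bodyrows = GridTableParser().parse(block)
    print colspecs    # column widths: [5, 7]
    print headrows    # one head row of (morerows, morecols, offset, text) cells
    print bodyrows    # one body row of cell tuples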