[Zope-Checkins] CVS: Zope/lib/python/docutils/parsers/rst -
__init__.py:1.5 states.py:1.5 tableparser.py:1.5
Andreas Jung
cvs-admin at zope.org
Sun Nov 30 10:07:47 EST 2003
Update of /cvs-repository/Zope/lib/python/docutils/parsers/rst
In directory cvs.zope.org:/tmp/cvs-serv31532/parsers/rst
Added Files:
__init__.py states.py tableparser.py
Log Message:
updated
=== Zope/lib/python/docutils/parsers/rst/__init__.py 1.4 => 1.5 ===
--- /dev/null Sun Nov 30 10:07:47 2003
+++ Zope/lib/python/docutils/parsers/rst/__init__.py Sun Nov 30 10:07:46 2003
@@ -0,0 +1,127 @@
+# Author: David Goodger
+# Contact: goodger at users.sourceforge.net
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This module has been placed in the public domain.
+
+"""
+This is ``docutils.parsers.rst`` package. It exports a single class, `Parser`,
+the reStructuredText parser.
+
+
+Usage
+=====
+
+1. Create a parser::
+
+ parser = docutils.parsers.rst.Parser()
+
+ Several optional arguments may be passed to modify the parser's behavior.
+ Please see `Customizing the Parser`_ below for details.
+
+2. Gather input (a multi-line string), by reading a file or the standard
+ input::
+
+ input = sys.stdin.read()
+
+3. Create a new empty `docutils.nodes.document` tree::
+
+ document = docutils.utils.new_document(source, settings)
+
+ See `docutils.utils.new_document()` for parameter details.
+
+4. Run the parser, populating the document tree::
+
+ parser.parse(input, document)
+
+
+Parser Overview
+===============
+
+The reStructuredText parser is implemented as a state machine, examining its
+input one line at a time. To understand how the parser works, please first
+become familiar with the `docutils.statemachine` module, then see the
+`states` module.
+
+
+Customizing the Parser
+----------------------
+
+Anything that isn't already customizable is that way simply because that type
+of customizability hasn't been implemented yet. Patches welcome!
+
+When instantiating an object of the `Parser` class, two parameters may be
+passed: ``rfc2822`` and ``inliner``. Pass ``rfc2822=1`` to enable an initial
+RFC-2822 style header block, parsed as a "field_list" element (with "class"
+attribute set to "rfc2822"). Currently this is the only body-level element
+which is customizable without subclassing. (Tip: subclass `Parser` and change
+its "state_classes" and "initial_state" attributes to refer to new classes.
+Contact the author if you need more details.)
+
+The ``inliner`` parameter takes an instance of `states.Inliner` or a subclass.
+It handles inline markup recognition. A common extension is the addition of
+further implicit hyperlinks, like "RFC 2822". This can be done by subclassing
+`states.Inliner`, adding a new method for the implicit markup, and adding a
+``(pattern, method)`` pair to the "implicit_dispatch" attribute of the
+subclass. See `states.Inliner.implicit_inline()` for details. Explicit
+inline markup can be customized in a `states.Inliner` subclass via the
+``patterns.initial`` and ``dispatch`` attributes (and new methods as
+appropriate).
+"""
+
+__docformat__ = 'reStructuredText'
+
+
+import docutils.parsers
+import docutils.statemachine
+from docutils.parsers.rst import states
+from docutils import frontend
+
+
+class Parser(docutils.parsers.Parser):
+
+ """The reStructuredText parser."""
+
+ supported = ('restructuredtext', 'rst', 'rest', 'restx', 'rtxt', 'rstx')
+ """Aliases this parser supports."""
+
+ settings_spec = (
+ 'reStructuredText Parser Options',
+ None,
+ (('Recognize and link to PEP references (like "PEP 258").',
+ ['--pep-references'],
+ {'action': 'store_true', 'validator': frontend.validate_boolean}),
+ ('Recognize and link to RFC references (like "RFC 822").',
+ ['--rfc-references'],
+ {'action': 'store_true', 'validator': frontend.validate_boolean}),
+ ('Set number of spaces for tab expansion (default 8).',
+ ['--tab-width'],
+ {'metavar': '<width>', 'type': 'int', 'default': 8}),
+ ('Remove spaces before footnote references.',
+ ['--trim-footnote-reference-space'],
+ {'action': 'store_true', 'validator': frontend.validate_boolean}),))
+
+ config_section = 'restructuredtext parser'
+ config_section_dependencies = ('parsers',)
+
+ def __init__(self, rfc2822=None, inliner=None):
+ if rfc2822:
+ self.initial_state = 'RFC2822Body'
+ else:
+ self.initial_state = 'Body'
+ self.state_classes = states.state_classes
+ self.inliner = inliner
+
+ def parse(self, inputstring, document):
+ """Parse `inputstring` and populate `document`, a document tree."""
+ self.setup_parse(inputstring, document)
+ debug = document.reporter[''].debug
+ self.statemachine = states.RSTStateMachine(
+ state_classes=self.state_classes,
+ initial_state=self.initial_state,
+ debug=debug)
+ inputlines = docutils.statemachine.string2lines(
+ inputstring, tab_width=document.settings.tab_width,
+ convert_whitespace=1)
+ self.statemachine.run(inputlines, document, inliner=self.inliner)
+ self.finish_parse()
=== Zope/lib/python/docutils/parsers/rst/states.py 1.4 => 1.5 === (2525/2925 lines abridged)
--- /dev/null Sun Nov 30 10:07:47 2003
+++ Zope/lib/python/docutils/parsers/rst/states.py Sun Nov 30 10:07:46 2003
@@ -0,0 +1,2922 @@
+# Author: David Goodger
+# Contact: goodger at users.sourceforge.net
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This module has been placed in the public domain.
+
+"""
+This is the ``docutils.parsers.restructuredtext.states`` module, the core of
+the reStructuredText parser. It defines the following:
+
+:Classes:
+ - `RSTStateMachine`: reStructuredText parser's entry point.
+ - `NestedStateMachine`: recursive StateMachine.
+ - `RSTState`: reStructuredText State superclass.
+ - `Inliner`: For parsing inline markup.
+ - `Body`: Generic classifier of the first line of a block.
+ - `SpecializedBody`: Superclass for compound element members.
+ - `BulletList`: Second and subsequent bullet_list list_items
+ - `DefinitionList`: Second+ definition_list_items.
+ - `EnumeratedList`: Second+ enumerated_list list_items.
+ - `FieldList`: Second+ fields.
+ - `OptionList`: Second+ option_list_items.
+ - `RFC2822List`: Second+ RFC2822-style fields.
+ - `ExtensionOptions`: Parses directive option fields.
+ - `Explicit`: Second+ explicit markup constructs.
+ - `SubstitutionDef`: For embedded directives in substitution definitions.
+ - `Text`: Classifier of second line of a text block.
+ - `SpecializedText`: Superclass for continuation lines of Text-variants.
+ - `Definition`: Second line of potential definition_list_item.
+ - `Line`: Second line of overlined section title or transition marker.
+ - `Struct`: An auxiliary collection class.
+
+:Exception classes:
+ - `MarkupError`
+ - `ParserError`
+ - `MarkupMismatch`
+
+:Functions:
+ - `escape2null()`: Return a string, escape-backslashes converted to nulls.
+ - `unescape()`: Return a string, nulls removed or restored to backslashes.
+
+:Attributes:
+ - `state_classes`: set of State classes used with `RSTStateMachine`.
+
+Parser Overview
+===============
+
+The reStructuredText parser is implemented as a recursive state machine,
+examining its input one line at a time. To understand how the parser works,
+please first become familiar with the `docutils.statemachine` module. In the
+description below, references are made to classes defined in this module;
+please see the individual classes for details.
+
+Parsing proceeds as follows:
+
+1. The state machine examines each line of input, checking each of the
+ transition patterns of the state `Body`, in order, looking for a match.
+ The implicit transitions (blank lines and indentation) are checked before
+ any others. The 'text' transition is a catch-all (matches anything).
+
+2. The method associated with the matched transition pattern is called.
+
+ A. Some transition methods are self-contained, appending elements to the
+ document tree (`Body.doctest` parses a doctest block). The parser's
+ current line index is advanced to the end of the element, and parsing
+ continues with step 1.
+
+ B. Other transition methods trigger the creation of a nested state machine,
+ whose job is to parse a compound construct ('indent' does a block quote,
+ 'bullet' does a bullet list, 'overline' does a section [first checking
+ for a valid section header], etc.).
+
+ - In the case of lists and explicit markup, a one-off state machine is
+ created and run to parse contents of the first item.
+
+ - A new state machine is created and its initial state is set to the
+ appropriate specialized state (`BulletList` in the case of the
+ 'bullet' transition; see `SpecializedBody` for more detail). This
+ state machine is run to parse the compound element (or series of
+ explicit markup elements), and returns as soon as a non-member element
+ is encountered. For example, the `BulletList` state machine ends as
+ soon as it encounters an element which is not a list item of that
+ bullet list. The optional omission of inter-element blank lines is
+ enabled by this nested state machine.
+
+ - The current line index is advanced to the end of the elements parsed,
+ and parsing continues with step 1.
+
+ C. The result of the 'text' transition depends on the next line of text.
+ The current state is changed to `Text`, under which the second line is
+ examined. If the second line is:
+
+ - Indented: The element is a definition list item, and parsing proceeds
+ similarly to step 2.B, using the `DefinitionList` state.
+
+ - A line of uniform punctuation characters: The element is a section
+ header; again, parsing proceeds as in step 2.B, and `Body` is still
+ used.
+
+ - Anything else: The element is a paragraph, which is examined for
+ inline markup and appended to the parent element. Processing
+ continues with step 1.
+"""
+
+__docformat__ = 'reStructuredText'
+
+
+import sys
+import re
+from docutils import roman
+from types import TupleType
+from docutils import nodes, statemachine, utils, urischemes
+from docutils import ApplicationError, DataError
+from docutils.statemachine import StateMachineWS, StateWS
+from docutils.nodes import fully_normalize_name as normalize_name
+from docutils.parsers.rst import directives, languages, tableparser
+from docutils.parsers.rst.languages import en as _fallback_language_module
+
+
+class MarkupError(DataError): pass
+class UnknownInterpretedRoleError(DataError): pass
+class InterpretedRoleNotImplementedError(DataError): pass
+class ParserError(ApplicationError): pass
+class MarkupMismatch(Exception): pass
+
+
+class Struct:
+
+ """Stores data attributes for dotted-attribute access."""
+
+ def __init__(self, **keywordargs):
+ self.__dict__.update(keywordargs)
+
+
+class RSTStateMachine(StateMachineWS):
+
+ """
+ reStructuredText's master StateMachine.
+
+ The entry point to reStructuredText parsing is the `run()` method.
+ """
+
+ def run(self, input_lines, document, input_offset=0, match_titles=1,
+ inliner=None):
+ """
+ Parse `input_lines` and return a `docutils.nodes.document` instance.
+
+ Extend `StateMachineWS.run()`: set up parse-global data, run the
+ StateMachine, and return the resulting
+ document.
+ """
+ self.language = languages.get_language(
+ document.settings.language_code)
+ self.match_titles = match_titles
+ if inliner is None:
+ inliner = Inliner()
+ inliner.init_customizations(document.settings)
+ self.memo = Struct(document=document,
+ reporter=document.reporter,
+ language=self.language,
+ title_styles=[],
+ section_level=0,
+ section_bubble_up_kludge=0,
+ inliner=inliner)
+ self.document = document
+ self.attach_observer(document.note_source)
+ self.reporter = self.memo.reporter
+ self.node = document
+ results = StateMachineWS.run(self, input_lines, input_offset,
+ input_source=document['source'])
+ assert results == [], 'RSTStateMachine.run() results should be empty!'
+ self.check_document()
+ self.node = self.memo = None # remove unneeded references
+
+ def check_document(self):
+ """Check for illegal structure: empty document."""
+ if len(self.document) == 0:
+ error = self.reporter.error(
+ 'Document empty; must have contents.', line=0)
+ self.document += error
+
+
+class NestedStateMachine(StateMachineWS):
+
+ """
+ StateMachine run from within other StateMachine runs, to parse nested
+ document structures.
+ """
+
+ def run(self, input_lines, input_offset, memo, node, match_titles=1):
+ """
+ Parse `input_lines` and populate a `docutils.nodes.document` instance.
+
+ Extend `StateMachineWS.run()`: set up document-wide data.
+ """
+ self.match_titles = match_titles
+ self.memo = memo
[-=- -=- -=- 2525 lines omitted -=- -=- -=-]
+ All transition methods are disabled. Override individual methods in
+ subclasses to re-enable.
+ """
+
+ def eof(self, context):
+ """Incomplete construct."""
+ return []
+
+ def invalid_input(self, match=None, context=None, next_state=None):
+ """Not a compound element member. Abort this state machine."""
+ raise EOFError
+
+ blank = invalid_input
+ indent = invalid_input
+ underline = invalid_input
+ text = invalid_input
+
+
+class Definition(SpecializedText):
+
+ """Second line of potential definition_list_item."""
+
+ def eof(self, context):
+ """Not a definition."""
+ self.state_machine.previous_line(2) # so parent SM can reassess
+ return []
+
+ def indent(self, match, context, next_state):
+ """Definition list item."""
+ definitionlistitem, blank_finish = self.definition_list_item(context)
+ self.parent += definitionlistitem
+ self.blank_finish = blank_finish
+ return [], 'DefinitionList', []
+
+
+class Line(SpecializedText):
+
+ """
+ Second line of over- & underlined section title or transition marker.
+ """
+
+ eofcheck = 1 # @@@ ???
+ """Set to 0 while parsing sections, so that we don't catch the EOF."""
+
+ def eof(self, context):
+ """Transition marker at end of section or document."""
+ marker = context[0].strip()
+ if self.memo.section_bubble_up_kludge:
+ self.memo.section_bubble_up_kludge = 0
+ elif len(marker) < 4:
+ self.state_correction(context)
+ if self.eofcheck: # ignore EOFError with sections
+ lineno = self.state_machine.abs_line_number() - 1
+ transition = nodes.transition(context[0])
+ transition.line = lineno
+ self.parent += transition
+ msg = self.reporter.error(
+ 'Document or section may not end with a transition.',
+ line=lineno)
+ self.parent += msg
+ self.eofcheck = 1
+ return []
+
+ def blank(self, match, context, next_state):
+ """Transition marker."""
+ lineno = self.state_machine.abs_line_number() - 1
+ marker = context[0].strip()
+ if len(marker) < 4:
+ self.state_correction(context)
+ transition = nodes.transition(marker)
+ transition.line = lineno
+ if len(self.parent) == 0:
+ msg = self.reporter.error(
+ 'Document or section may not begin with a transition.',
+ line=lineno)
+ self.parent += msg
+ elif isinstance(self.parent[-1], nodes.transition):
+ msg = self.reporter.error(
+ 'At least one body element must separate transitions; '
+ 'adjacent transitions not allowed.',
+ line=lineno)
+ self.parent += msg
+ self.parent += transition
+ return [], 'Body', []
+
+ def text(self, match, context, next_state):
+ """Potential over- & underlined title."""
+ lineno = self.state_machine.abs_line_number() - 1
+ overline = context[0]
+ title = match.string
+ underline = ''
+ try:
+ underline = self.state_machine.next_line()
+ except EOFError:
+ blocktext = overline + '\n' + title
+ if len(overline.rstrip()) < 4:
+ self.short_overline(context, blocktext, lineno, 2)
+ else:
+ msg = self.reporter.severe(
+ 'Incomplete section title.',
+ nodes.literal_block(blocktext, blocktext), line=lineno)
+ self.parent += msg
+ return [], 'Body', []
+ source = '%s\n%s\n%s' % (overline, title, underline)
+ overline = overline.rstrip()
+ underline = underline.rstrip()
+ if not self.transitions['underline'][0].match(underline):
+ blocktext = overline + '\n' + title + '\n' + underline
+ if len(overline.rstrip()) < 4:
+ self.short_overline(context, blocktext, lineno, 2)
+ else:
+ msg = self.reporter.severe(
+ 'Missing matching underline for section title overline.',
+ nodes.literal_block(source, source), line=lineno)
+ self.parent += msg
+ return [], 'Body', []
+ elif overline != underline:
+ blocktext = overline + '\n' + title + '\n' + underline
+ if len(overline.rstrip()) < 4:
+ self.short_overline(context, blocktext, lineno, 2)
+ else:
+ msg = self.reporter.severe(
+ 'Title overline & underline mismatch.',
+ nodes.literal_block(source, source), line=lineno)
+ self.parent += msg
+ return [], 'Body', []
+ title = title.rstrip()
+ messages = []
+ if len(title) > len(overline):
+ blocktext = overline + '\n' + title + '\n' + underline
+ if len(overline.rstrip()) < 4:
+ self.short_overline(context, blocktext, lineno, 2)
+ else:
+ msg = self.reporter.warning(
+ 'Title overline too short.',
+ nodes.literal_block(source, source), line=lineno)
+ messages.append(msg)
+ style = (overline[0], underline[0])
+ self.eofcheck = 0 # @@@ not sure this is correct
+ self.section(title.lstrip(), source, style, lineno + 1, messages)
+ self.eofcheck = 1
+ return [], 'Body', []
+
+ indent = text # indented title
+
+ def underline(self, match, context, next_state):
+ overline = context[0]
+ blocktext = overline + '\n' + self.state_machine.line
+ lineno = self.state_machine.abs_line_number() - 1
+ if len(overline.rstrip()) < 4:
+ self.short_overline(context, blocktext, lineno, 1)
+ msg = self.reporter.error(
+ 'Invalid section title or transition marker.',
+ nodes.literal_block(blocktext, blocktext), line=lineno)
+ self.parent += msg
+ return [], 'Body', []
+
+ def short_overline(self, context, blocktext, lineno, lines=1):
+ msg = self.reporter.info(
+ 'Possible incomplete section title.\nTreating the overline as '
+ "ordinary text because it's so short.", line=lineno)
+ self.parent += msg
+ self.state_correction(context, lines)
+
+ def state_correction(self, context, lines=1):
+ self.state_machine.previous_line(lines)
+ context[:] = []
+ raise statemachine.StateCorrection('Body', 'text')
+
+
+state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
+ OptionList, ExtensionOptions, Explicit, Text, Definition,
+ Line, SubstitutionDef, RFC2822Body, RFC2822List)
+"""Standard set of State classes used to start `RSTStateMachine`."""
+
+
+def escape2null(text):
+ """Return a string with escape-backslashes converted to nulls."""
+ parts = []
+ start = 0
+ while 1:
+ found = text.find('\\', start)
+ if found == -1:
+ parts.append(text[start:])
+ return ''.join(parts)
+ parts.append(text[start:found])
+ parts.append('\x00' + text[found+1:found+2])
+ start = found + 2 # skip character after escape
+
+def unescape(text, restore_backslashes=0):
+ """
+ Return a string with nulls removed or restored to backslashes.
+ Backslash-escaped spaces are also removed.
+ """
+ if restore_backslashes:
+ return text.replace('\x00', '\\')
+ else:
+ for sep in ['\x00 ', '\x00\n', '\x00']:
+ text = ''.join(text.split(sep))
+ return text
=== Zope/lib/python/docutils/parsers/rst/tableparser.py 1.4 => 1.5 ===
--- /dev/null Sun Nov 30 10:07:47 2003
+++ Zope/lib/python/docutils/parsers/rst/tableparser.py Sun Nov 30 10:07:46 2003
@@ -0,0 +1,522 @@
+# Author: David Goodger
+# Contact: goodger at users.sourceforge.net
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This module has been placed in the public domain.
+
+"""
+This module defines table parser classes,which parse plaintext-graphic tables
+and produce a well-formed data structure suitable for building a CALS table.
+
+:Classes:
+ - `GridTableParser`: Parse fully-formed tables represented with a grid.
+ - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
+ borders.
+
+:Exception class: `TableMarkupError`
+
+:Function:
+ `update_dict_of_lists()`: Merge two dictionaries containing list values.
+"""
+
+__docformat__ = 'reStructuredText'
+
+
+import re
+import sys
+from docutils import DataError
+
+
+class TableMarkupError(DataError): pass
+
+
+class TableParser:
+
+ """
+ Abstract superclass for the common parts of the syntax-specific parsers.
+ """
+
+ head_body_separator_pat = None
+ """Matches the row separator between head rows and body rows."""
+
+ def parse(self, block):
+ """
+ Analyze the text `block` and return a table data structure.
+
+ Given a plaintext-graphic table in `block` (list of lines of text; no
+ whitespace padding), parse the table, construct and return the data
+ necessary to construct a CALS table or equivalent.
+
+ Raise `TableMarkupError` if there is any problem with the markup.
+ """
+ self.setup(block)
+ self.find_head_body_sep()
+ self.parse_table()
+ structure = self.structure_from_cells()
+ return structure
+
+ def find_head_body_sep(self):
+ """Look for a head/body row separator line; store the line index."""
+ for i in range(len(self.block)):
+ line = self.block[i]
+ if self.head_body_separator_pat.match(line):
+ if self.head_body_sep:
+ raise TableMarkupError(
+ 'Multiple head/body row separators in table (at line '
+ 'offset %s and %s); only one allowed.'
+ % (self.head_body_sep, i))
+ else:
+ self.head_body_sep = i
+ self.block[i] = line.replace('=', '-')
+ if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
+ - 1):
+ raise TableMarkupError('The head/body row separator may not be '
+ 'the first or last line of the table.')
+
+
+class GridTableParser(TableParser):
+
+ """
+ Parse a grid table using `parse()`.
+
+ Here's an example of a grid table::
+
+ +------------------------+------------+----------+----------+
+ | Header row, column 1 | Header 2 | Header 3 | Header 4 |
+ +========================+============+==========+==========+
+ | body row 1, column 1 | column 2 | column 3 | column 4 |
+ +------------------------+------------+----------+----------+
+ | body row 2 | Cells may span columns. |
+ +------------------------+------------+---------------------+
+ | body row 3 | Cells may | - Table cells |
+ +------------------------+ span rows. | - contain |
+ | body row 4 | | - body elements. |
+ +------------------------+------------+---------------------+
+
+ Intersections use '+', row separators use '-' (except for one optional
+ head/body row separator, which uses '='), and column separators use '|'.
+
+ Passing the above table to the `parse()` method will result in the
+ following data structure::
+
+ ([24, 12, 10, 10],
+ [[(0, 0, 1, ['Header row, column 1']),
+ (0, 0, 1, ['Header 2']),
+ (0, 0, 1, ['Header 3']),
+ (0, 0, 1, ['Header 4'])]],
+ [[(0, 0, 3, ['body row 1, column 1']),
+ (0, 0, 3, ['column 2']),
+ (0, 0, 3, ['column 3']),
+ (0, 0, 3, ['column 4'])],
+ [(0, 0, 5, ['body row 2']),
+ (0, 2, 5, ['Cells may span columns.']),
+ None,
+ None],
+ [(0, 0, 7, ['body row 3']),
+ (1, 0, 7, ['Cells may', 'span rows.', '']),
+ (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
+ None],
+ [(0, 0, 9, ['body row 4']), None, None, None]])
+
+ The first item is a list containing column widths (colspecs). The second
+ item is a list of head rows, and the third is a list of body rows. Each
+ row contains a list of cells. Each cell is either None (for a cell unused
+ because of another cell's span), or a tuple. A cell tuple contains four
+ items: the number of extra rows used by the cell in a vertical span
+ (morerows); the number of extra columns used by the cell in a horizontal
+ span (morecols); the line offset of the first line of the cell contents;
+ and the cell contents, a list of lines of text.
+ """
+
+ head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
+
+ def setup(self, block):
+ self.block = block[:] # make a copy; it may be modified
+ self.block.disconnect() # don't propagate changes to parent
+ self.bottom = len(block) - 1
+ self.right = len(block[0]) - 1
+ self.head_body_sep = None
+ self.done = [-1] * len(block[0])
+ self.cells = []
+ self.rowseps = {0: [0]}
+ self.colseps = {0: [0]}
+
+ def parse_table(self):
+ """
+ Start with a queue of upper-left corners, containing the upper-left
+ corner of the table itself. Trace out one rectangular cell, remember
+ it, and add its upper-right and lower-left corners to the queue of
+ potential upper-left corners of further cells. Process the queue in
+ top-to-bottom order, keeping track of how much of each text column has
+ been seen.
+
+ We'll end up knowing all the row and column boundaries, cell positions
+ and their dimensions.
+ """
+ corners = [(0, 0)]
+ while corners:
+ top, left = corners.pop(0)
+ if top == self.bottom or left == self.right \
+ or top <= self.done[left]:
+ continue
+ result = self.scan_cell(top, left)
+ if not result:
+ continue
+ bottom, right, rowseps, colseps = result
+ update_dict_of_lists(self.rowseps, rowseps)
+ update_dict_of_lists(self.colseps, colseps)
+ self.mark_done(top, left, bottom, right)
+ cellblock = self.block.get_2D_block(top + 1, left + 1,
+ bottom, right)
+ cellblock.disconnect() # lines in cell can't sync with parent
+ self.cells.append((top, left, bottom, right, cellblock))
+ corners.extend([(top, right), (bottom, left)])
+ corners.sort()
+ if not self.check_parse_complete():
+ raise TableMarkupError('Malformed table; parse incomplete.')
+
+ def mark_done(self, top, left, bottom, right):
+ """For keeping track of how much of each text column has been seen."""
+ before = top - 1
+ after = bottom - 1
+ for col in range(left, right):
+ assert self.done[col] == before
+ self.done[col] = after
+
+ def check_parse_complete(self):
+ """Each text column should have been completely seen."""
+ last = self.bottom - 1
+ for col in range(self.right):
+ if self.done[col] != last:
+ return None
+ return 1
+
+ def scan_cell(self, top, left):
+ """Starting at the top-left corner, start tracing out a cell."""
+ assert self.block[top][left] == '+'
+ result = self.scan_right(top, left)
+ return result
+
+ def scan_right(self, top, left):
+ """
+ Look for the top-right corner of the cell, and make note of all column
+ boundaries ('+').
+ """
+ colseps = {}
+ line = self.block[top]
+ for i in range(left + 1, self.right + 1):
+ if line[i] == '+':
+ colseps[i] = [top]
+ result = self.scan_down(top, left, i)
+ if result:
+ bottom, rowseps, newcolseps = result
+ update_dict_of_lists(colseps, newcolseps)
+ return bottom, i, rowseps, colseps
+ elif line[i] != '-':
+ return None
+ return None
+
+ def scan_down(self, top, left, right):
+ """
+ Look for the bottom-right corner of the cell, making note of all row
+ boundaries.
+ """
+ rowseps = {}
+ for i in range(top + 1, self.bottom + 1):
+ if self.block[i][right] == '+':
+ rowseps[i] = [right]
+ result = self.scan_left(top, left, i, right)
+ if result:
+ newrowseps, colseps = result
+ update_dict_of_lists(rowseps, newrowseps)
+ return i, rowseps, colseps
+ elif self.block[i][right] != '|':
+ return None
+ return None
+
+ def scan_left(self, top, left, bottom, right):
+ """
+ Noting column boundaries, look for the bottom-left corner of the cell.
+ It must line up with the starting point.
+ """
+ colseps = {}
+ line = self.block[bottom]
+ for i in range(right - 1, left, -1):
+ if line[i] == '+':
+ colseps[i] = [bottom]
+ elif line[i] != '-':
+ return None
+ if line[left] != '+':
+ return None
+ result = self.scan_up(top, left, bottom, right)
+ if result is not None:
+ rowseps = result
+ return rowseps, colseps
+ return None
+
+ def scan_up(self, top, left, bottom, right):
+ """
+ Noting row boundaries, see if we can return to the starting point.
+ """
+ rowseps = {}
+ for i in range(bottom - 1, top, -1):
+ if self.block[i][left] == '+':
+ rowseps[i] = [left]
+ elif self.block[i][left] != '|':
+ return None
+ return rowseps
+
+ def structure_from_cells(self):
+ """
+ From the data collected by `scan_cell()`, convert to the final data
+ structure.
+ """
+ rowseps = self.rowseps.keys() # list of row boundaries
+ rowseps.sort()
+ rowindex = {}
+ for i in range(len(rowseps)):
+ rowindex[rowseps[i]] = i # row boundary -> row number mapping
+ colseps = self.colseps.keys() # list of column boundaries
+ colseps.sort()
+ colindex = {}
+ for i in range(len(colseps)):
+ colindex[colseps[i]] = i # column boundary -> col number map
+ colspecs = [(colseps[i] - colseps[i - 1] - 1)
+ for i in range(1, len(colseps))] # list of column widths
+ # prepare an empty table with the correct number of rows & columns
+ onerow = [None for i in range(len(colseps) - 1)]
+ rows = [onerow[:] for i in range(len(rowseps) - 1)]
+ # keep track of # of cells remaining; should reduce to zero
+ remaining = (len(rowseps) - 1) * (len(colseps) - 1)
+ for top, left, bottom, right, block in self.cells:
+ rownum = rowindex[top]
+ colnum = colindex[left]
+ assert rows[rownum][colnum] is None, (
+ 'Cell (row %s, column %s) already used.'
+ % (rownum + 1, colnum + 1))
+ morerows = rowindex[bottom] - rownum - 1
+ morecols = colindex[right] - colnum - 1
+ remaining -= (morerows + 1) * (morecols + 1)
+ # write the cell into the table
+ rows[rownum][colnum] = (morerows, morecols, top + 1, block)
+ assert remaining == 0, 'Unused cells remaining.'
+ if self.head_body_sep: # separate head rows from body rows
+ numheadrows = rowindex[self.head_body_sep]
+ headrows = rows[:numheadrows]
+ bodyrows = rows[numheadrows:]
+ else:
+ headrows = []
+ bodyrows = rows
+ return (colspecs, headrows, bodyrows)
+
+
+class SimpleTableParser(TableParser):
+
+ """
+ Parse a simple table using `parse()`.
+
+ Here's an example of a simple table::
+
+ ===== =====
+ col 1 col 2
+ ===== =====
+ 1 Second column of row 1.
+ 2 Second column of row 2.
+ Second line of paragraph.
+ 3 - Second column of row 3.
+
+ - Second item in bullet
+ list (row 3, column 2).
+ 4 is a span
+ ------------
+ 5
+ ===== =====
+
+ Top and bottom borders use '=', column span underlines use '-', column
+ separation is indicated with spaces.
+
+ Passing the above table to the `parse()` method will result in the
+ following data structure, whose interpretation is the same as for
+ `GridTableParser`::
+
+ ([5, 25],
+ [[(0, 0, 1, ['col 1']),
+ (0, 0, 1, ['col 2'])]],
+ [[(0, 0, 3, ['1']),
+ (0, 0, 3, ['Second column of row 1.'])],
+ [(0, 0, 4, ['2']),
+ (0, 0, 4, ['Second column of row 2.',
+ 'Second line of paragraph.'])],
+ [(0, 0, 6, ['3']),
+ (0, 0, 6, ['- Second column of row 3.',
+ '',
+ '- Second item in bullet',
+ ' list (row 3, column 2).'])],
+ [(0, 1, 10, ['4 is a span'])],
+ [(0, 0, 12, ['5']),
+ (0, 0, 12, [''])]])
+ """
+
+ head_body_separator_pat = re.compile('=[ =]*$')
+ span_pat = re.compile('-[ -]*$')
+
+ def setup(self, block):
+ self.block = block[:] # make a copy; it will be modified
+ self.block.disconnect() # don't propagate changes to parent
+ # Convert top & bottom borders to column span underlines:
+ self.block[0] = self.block[0].replace('=', '-')
+ self.block[-1] = self.block[-1].replace('=', '-')
+ self.head_body_sep = None
+ self.columns = []
+ self.border_end = None
+ self.table = []
+ self.done = [-1] * len(block[0])
+ self.rowseps = {0: [0]}
+ self.colseps = {0: [0]}
+
+ def parse_table(self):
+ """
+ First determine the column boundaries from the top border, then
+ process rows. Each row may consist of multiple lines; accumulate
+ lines until a row is complete. Call `self.parse_row` to finish the
+ job.
+ """
+ # Top border must fully describe all table columns.
+ self.columns = self.parse_columns(self.block[0], 0)
+ self.border_end = self.columns[-1][1]
+ firststart, firstend = self.columns[0]
+ offset = 1 # skip top border
+ start = 1
+ text_found = None
+ while offset < len(self.block):
+ line = self.block[offset]
+ if self.span_pat.match(line):
+ # Column span underline or border; row is complete.
+ self.parse_row(self.block[start:offset], start,
+ (line.rstrip(), offset))
+ start = offset + 1
+ text_found = None
+ elif line[firststart:firstend].strip():
+ # First column not blank, therefore it's a new row.
+ if text_found and offset != start:
+ self.parse_row(self.block[start:offset], start)
+ start = offset
+ text_found = 1
+ elif not text_found:
+ start = offset + 1
+ offset += 1
+
+ def parse_columns(self, line, offset):
+ """
+ Given a column span underline, return a list of (begin, end) pairs.
+ """
+ cols = []
+ end = 0
+ while 1:
+ begin = line.find('-', end)
+ end = line.find(' ', begin)
+ if begin < 0:
+ break
+ if end < 0:
+ end = len(line)
+ cols.append((begin, end))
+ if self.columns:
+ if cols[-1][1] != self.border_end:
+ raise TableMarkupError('Column span incomplete at line '
+ 'offset %s.' % offset)
+ # Allow for an unbounded rightmost column:
+ cols[-1] = (cols[-1][0], self.columns[-1][1])
+ return cols
+
+ def init_row(self, colspec, offset):
+ i = 0
+ cells = []
+ for start, end in colspec:
+ morecols = 0
+ try:
+ assert start == self.columns[i][0]
+ while end != self.columns[i][1]:
+ i += 1
+ morecols += 1
+ except (AssertionError, IndexError):
+ raise TableMarkupError('Column span alignment problem at '
+ 'line offset %s.' % (offset + 1))
+ cells.append([0, morecols, offset, []])
+ i += 1
+ return cells
+
+ def parse_row(self, lines, start, spanline=None):
+ """
+ Given the text `lines` of a row, parse it and append to `self.table`.
+
+ The row is parsed according to the current column spec (either
+ `spanline` if provided or `self.columns`). For each column, extract
+ text from each line, and check for text in column margins. Finally,
+ adjust for insigificant whitespace.
+ """
+ if not (lines or spanline):
+ # No new row, just blank lines.
+ return
+ if spanline:
+ columns = self.parse_columns(*spanline)
+ span_offset = spanline[1]
+ else:
+ columns = self.columns[:]
+ span_offset = start
+ self.check_columns(lines, start, columns)
+ row = self.init_row(columns, start)
+ for i in range(len(columns)):
+ start, end = columns[i]
+ cellblock = lines.get_2D_block(0, start, len(lines), end)
+ cellblock.disconnect() # lines in cell can't sync with parent
+ row[i][3] = cellblock
+ self.table.append(row)
+
+ def check_columns(self, lines, first_line, columns):
+ """
+ Check for text in column margins and text overflow in the last column.
+ Raise TableMarkupError if anything but whitespace is in column margins.
+ Adjust the end value for the last column if there is text overflow.
+ """
+ # "Infinite" value for a dummy last column's beginning, used to
+ # check for text overflow:
+ columns.append((sys.maxint, None))
+ lastcol = len(columns) - 2
+ for i in range(len(columns) - 1):
+ start, end = columns[i]
+ nextstart = columns[i+1][0]
+ offset = 0
+ for line in lines:
+ if i == lastcol and line[end:].strip():
+ text = line[start:].rstrip()
+ new_end = start + len(text)
+ columns[i] = (start, new_end)
+ main_start, main_end = self.columns[-1]
+ if new_end > main_end:
+ self.columns[-1] = (main_start, new_end)
+ elif line[end:nextstart].strip():
+ raise TableMarkupError('Text in column margin at line '
+ 'offset %s.' % (first_line + offset))
+ offset += 1
+ columns.pop()
+
+ def structure_from_cells(self):
+ colspecs = [end - start for start, end in self.columns]
+ first_body_row = 0
+ if self.head_body_sep:
+ for i in range(len(self.table)):
+ if self.table[i][0][2] > self.head_body_sep:
+ first_body_row = i
+ break
+ return (colspecs, self.table[:first_body_row],
+ self.table[first_body_row:])
+
+
+def update_dict_of_lists(master, newdata):
+ """
+ Extend the list values of `master` with those from `newdata`.
+
+ Both parameters must be dictionaries containing list values.
+ """
+ for key, values in newdata.items():
+ master.setdefault(key, []).extend(values)
More information about the Zope-Checkins
mailing list