[Zope-CVS] CVS: Packages/HTMLStructure - Parser.py:1.1 Printer.py:1.1 Validator.py:1.1 __init__.py:1.1
Evan Simpson
evan@zope.com
Wed, 23 Jan 2002 10:57:57 -0500
Update of /cvs-repository/Packages/HTMLStructure
In directory cvs.zope.org:/tmp/cvs-serv29501
Added Files:
Parser.py Printer.py Validator.py __init__.py
Log Message:
Initial checkin.
=== Added File Packages/HTMLStructure/Parser.py ===
from mx.TextTools import *
# Utility functions
from bisect import bisect
def getLines(text):
def add(taglist,text,l,r,subtags):
taglist.append(r)
pt_lines = (
(None, AllNotIn, '\n', +1),
(add, Is + CallTag, '\n', MatchOk, -1),
)
return tag(text, pt_lines)[1]
class ParsedText:
def __init__(self, text, parseTables=None):
self.text = text
if parseTables is None:
parseTables = ParseTables()
self.parse = tag(text, parseTables.pt_html)
self.linemap = getLines(text)
def lineOfChar(self, index):
'''Convert a character index into a line number.'''
return bisect(self.linemap, index)
def elementText(self, element):
'''Get the text for a parse element.'''
return self.text[element[1]:element[2]]
def tagName(self, tag):
'''Get the name of a tag parse element.'''
return self.elementText(tag[3][0][3][0]).lower()
def tagType(self, tag, is_empty=None):
'''Get the open/close/empty type of a tag.
Return None for elements that are not tags, or not one of
these three types of tag.
'''
if tag[0] != 'tag':
return
sub = tag[3][0]
# The first sub-element of a tag is the type.
ttype = sub[0]
# We only care about opening, closing, and empty tags
if ttype == 'close':
return ttype
if ttype == 'open':
# Check whether an open tag is empty
if (sub[3][-1][0] == 'empty' or
(is_empty and is_empty(self.tagName(tag))) ):
return 'empty'
return ttype
# Parse HTML. Needs to be read from bottom to top.
hexadecimal_set = set(number + 'abcdefABCDEF')
namestart_set = set(alpha + ':_')
name_set = set(alphanumeric + '.-_:')
basic_ws_set = set(' \t\r\n')
opt_ws = (None, AllInSet, basic_ws_set, +1)
req_ws = (None, AllInSet, basic_ws_set)
class ParseTables:
def __init__(self, handle_map=None):
'''Transform the parse table tuples so that they are linked,
and insert specified handlers.
"handle_map" must be a dictionary with keys of the form
(tablename, tuplename). The values will replace the first
element of the named tuples, and cause them to be used as
callbacks if they are callable.
'''
if handle_map is None:
handle_map = {}
todo = {}
for k, v in self.__class__.__dict__.items():
if k.startswith('pt_'):
todo[k] = v
done = {}
for k in todo.keys():
self.process(k, todo, {}, done, handle_map)
setattr(self, k, done[k])
def process(self, current, todo, working, done, handle_map):
if done.get(current) is not None:
return
if working.get(current):
raise ValueError, ('Circular table reference found at %s'
% `current`)
working[current] = 1
new_tuples = []
for tup in todo[current]:
handler = handle_map.get((current, tup[0]))
extra = 0
if callable(handler):
extra = CallTag
if handler:
tup = (handler, tup[1] + extra) + tup[2:]
if (tup[1] & 0xff) in (Table, SubTable):
table = tup[2]
if isinstance(table, type('')):
self.process(tup[2], todo, working, done, handle_map)
tup = tup[:2] + (done[tup[2]],) + tup[3:]
new_tuples.append(tup)
done[current] = tuple(new_tuples)
working[current] = 0
pt_name = (
(None, IsInSet, namestart_set),
(None, AllInSet, name_set, +1),
)
pt_charref = (
# Required character entity ref start
(None, Is, '&'),
# Try numeric
(None, Is, '#', +4),
# Is numeric, try Hex
(None, IsIn, 'xX', +2),
('hex', AllInSet, hexadecimal_set, +3, +4),
# Is numeric, try Decimal
('digits', AllInSet, number_set, +2, +3),
# Try symbolic
('name', Table, 'pt_name', +1, +2),
# Note an error, but don't fail to match
('error', Skip, 0),
# Optional ';' at the end
(None, Is, ';', +1, MatchOk),
# If not explicitly terminated, reject name chars
('error', IsInSet, name_set, +1),
)
pt_attrvalue1 = (
# Attribute value in double quotes
('quote', Is, '"'),
# Stop at '&' or end of string
(None, AllNotIn, '&"', +1),
(None, Is + LookAhead, '&', +2),
# Handle an entity ref
('charref', Table, 'pt_charref', -2, -2),
(None, Is, '"'),
)
pt_attrvalue2 = (
# Attribute value in single quotes
('quote', Is, "'"),
# Stop at '&' or end of string
(None, AllNotIn, "&'", +1),
(None, Is + LookAhead, '&', +2),
# Handle an entity ref
('charref', Table, 'pt_charref', -2, -2),
(None, Is, "'"),
)
pt_attrvalue3 = (
# Bare attribute value
('quote', Skip, 0),
# Stop at '&' or end of string
(None, AllIn, alphanumeric + '-./:;+*%?!$()_#=~'),
(None, Is + LookAhead, '&', MatchOk),
# Handle an entity ref
('charref', Table, 'pt_charref', -2, -2),
)
pt_attribute1 = (
('name', Table, 'pt_name'),
opt_ws,
(None, Is, '='),
opt_ws,
# Try double, then single quoted value
('value', Table, 'pt_attrvalue1', +1, MatchOk),
('value', Table, 'pt_attrvalue2', +1, MatchOk),
('value', Table, 'pt_attrvalue3'),
)
pt_attribute2 = (
('name', Table, 'pt_name'),
)
pt_attributes = (
opt_ws, #should be req_ws, or complain if ws is missing
# Try attribute with value first, then plain name
('attr', Table, 'pt_attribute1', +1, +2),
('attr', Table, 'pt_attribute2'),
# Optional additional attributes
(None, SubTable, ThisTable, MatchOk),
(None, Jump, To, -1),
)
pt_open_tag = (
('name', Table, 'pt_name'),
('attrs', Table, 'pt_attributes', +1),
opt_ws,
# Empty tag?
('empty', Is, '/', +1),
# Close it.
(None, Is, '>')
)
pt_close_tag = (
# Required close tag start
(None, Is, '/'),
('name', Table, 'pt_name'),
opt_ws,
(None, Is, '>'),
)
pt_comment = (
# Required comment start
(None, Word, '!--'),
(None, AllNotIn, '-', +1),
(None, Is, '-'),
# After a hyphen, require a non-hyphen or the end
(None, IsNot, '-', +1, -2),
(None, Word, '->'),
)
pt_directive = (
# Required directive start
(None, Is, '!'),
('type', AllInSet, A2Z_set),
# Bogus and incomplete!
(None, WordEnd, '>'),
)
pt_PI = (
(None, Is, '?'),
('target', Table, 'pt_name'),
(None, WordEnd, '?>'),
)
pt_tag = (
# Required tag start
(None, Is, '<'),
# Try open tag
('open', Table, 'pt_open_tag', +1, MatchOk),
# Try close tag
('close', Table, 'pt_close_tag', +1, MatchOk),
# Try comment
('comment', Table, 'pt_comment', +1, MatchOk),
# Try directive
('directive', Table, 'pt_directive', +1, MatchOk),
# Finally, try PI
('PI', Table, 'pt_PI'),
)
pt_text = (
# grab plain text
(None, AllNotIn, '&<', +1),
# if we hit a tag, we're done.
(None, Is + LookAhead, '<', +1, MatchOk),
# We must have an '&', or be done.
(None, Is + LookAhead, '&', MatchOk),
('charref', Table, 'pt_charref', -3, -3)
)
# Main tag for HTML fragment parsing. The resulting parse structure
# highlights entity references, and all kinds of tags. Intervening
# text spans can easily be inferred.
pt_html = (
# grab text up to '<' or eof
(None, SubTable, 'pt_text', +1),
# If eof, we're done
(None, EOF, Here, +1, MatchOk),
# grab a tag and loop, or fail
('tag', Table, 'pt_tag', MatchFail, -2),
)
=== Added File Packages/HTMLStructure/Printer.py ===
from bisect import bisect
from cgi import escape
def printHTML(parsedtext, span_marks):
out = []
output = out.append
linelist = parsedtext.linemap
#linemap = {}
#for sm in span_marks:
# bline = bisect(linelist, sm[0])
# eline = bisect(linelist, sm[1])
# if sm[1] == linemap[eline]:
# eline -= 1
# linemap.setdefault(bline, [])
# linemap[bline].append(sm)
# if bline != eline:
# linemap.setdefault(eline, [])
# linemap[eline].append(sm)
output('<pre class="source">')
text = parsedtext.text
lend = 0
for n in range(len(linelist)):
output('<a name="%s" class="linenumber">%s</a>' % (n + 1, n + 1))
lbegin = lend
lend = linelist[n]
output(escape(text[lbegin:lend]))
output('</pre>')
return ''.join(out)
=== Added File Packages/HTMLStructure/Validator.py ===
from bisect import bisect
class Validator:
def __init__(self, parsedText):
self.pt = parsedText
self.is_empty = IS_EMPTY_HTML_TAG
self.impclose_enclosing_map = IMPCLOSE_ENCLOSING_MAP
self.impclose_following_map = IMPCLOSE_FOLLOWING_MAP
def entities(self):
'''Check all character entity references'''
all = []
errors = []
self._collect_entities(self.pt.parse[1], all)
for entity in all:
sub = entity[3]
if sub[-1][0] == 'error' or sub[0][2] - sub[0][1] > 32:
# Malformed or too long
errors.append(entity)
return all, errors
def _collect_entities(self, elements, entities):
for element in elements:
sub = element[3]
if element[0] == 'charref':
entities.append(element)
elif sub:
self._collect_entities(sub, entities)
def match_tags(self):
'''Attempt to pair up all tags properly
We scan the tags backwards, accumulating a stack of closing
tags and matching them with opening tags. When a pair of tags
matches exactly, the 2-tuple (open, close) of the parse
indexes of the matched tags are added to the "matched" list.
The parse indexes of unmatched tags are returned in the
"unclosed" and "unopened" lists.
'''
# We will return lists of matched and unmatched tags
matched = []
unclosed = []
unopened = []
parse = self.pt.parse[1]
tagName = self.pt.tagName
unop_names = []
# Scan non-empty tags (in reverse, as collect_tags returns them)
for (i, ttype) in self.collect_tags(empty=0):
tname = tagName(parse[i])
if ttype == 'close':
unopened.append(i)
unop_names.append(tname)
elif unopened and unop_names[-1] == tname:
matched.append((i, unopened.pop()))
unop_names.pop()
else:
unclosed.append(i)
# Return the lists in forward order
matched.reverse()
unopened.reverse()
unclosed.reverse()
return matched, unclosed, unopened
def collect_tags(self, opening=1, closing=1, empty=1):
'''Return a list of tags (in reverse order).
Elements of the list are (parse index, tag type) pairs.
Arguments to the method indicate which types of tag to include.
'''
parse = self.pt.parse[1]
tagName = self.pt.tagName
tagType = self.pt.tagType
is_empty = self.is_empty
octags = []
# Scan backwards
i = len(parse)
while i:
i = i - 1
ttype = tagType(parse[i], is_empty)
# Only bother with open/close/empty tags
if ttype is None:
continue
# Skip tag types that aren't asked for
if ((opening and ttype == 'open') or
(closing and ttype == 'close') or
(empty and ttype == 'empty')):
octags.append((i, ttype))
return octags
def implicit_match_tags(self, matched, unclosed):
'''Try to find implicit closing tags for unmatched opening tags.
This should be called with the output of the "match_tags"
method. For each unmatched opening tag that can be implicitly
closed, we scan forward for a tag capable of closing it.
An implicitly closable tag can only be closed if it has and
enclosing tag that is 'willing', or there is a subsequent tag
at the same nesting level
'''
# Save effort
if not unclosed:
return (), ()
parse = self.pt.parse[1]
tagName = self.pt.tagName
impclose_following_map = self.impclose_following_map
impclose_enclosing_map = self.impclose_enclosing_map
new_matched = []
still_unclosed = []
for pidx in unclosed:
tag = parse[pidx]
tname = tagName(tag)
enc_closers = impclose_enclosing_map.get(tname)
if enc_closers is None:
still_unclosed.append(pidx)
else:
tag_iter = Parallel_advance(matched, unclosed, pidx)
closers = impclose_following_map[tname]
while 1:
idx = tag_iter.next()
if idx is None: break
ftname = tagName(parse[idx])
if ftname in closers:
break
if idx is None:
# We didn't find a following closer
enc_tag = tag_iter.enc_tag
if enc_tag is None:
if "" in enc_closers:
# Implicit close at EOF
idx = -1
else:
etname = tagName(parse[enc_tag[0]])
if etname in enc_closers or "" in enc_closers:
# Implicit close at end of enclosing tag
idx = enc_tag[1]
if idx is None:
still_unclosed.append(pidx)
else:
new_matched.append((pidx, idx))
return new_matched, still_unclosed
class Parallel_advance:
stop = 0
enc_tag = None
def __init__(self, matched, unclosed, pidx):
#import pdb; pdb.set_trace()
self.matched = matched
self.unclosed = unclosed
mpos = bisect(matched, (pidx,))
while mpos and matched[mpos - 1][1] < pidx:
mpos -= 1
if mpos:
self.enc_tag = matched[mpos - 1]
self.stop = self.enc_tag[1]
self.scan = pidx + 1
def next_unclosed(self):
unclosed = self.unclosed
pos = bisect(unclosed, self.scan)
if pos and unclosed[pos - 1] == self.scan:
pos -= 1
if pos < len(unclosed):
idx = unclosed[pos]
if not self.stop or idx < self.stop:
return idx
def next_matched(self):
matched = self.matched
pos = bisect(matched, (self.scan,))
if pos < len(matched):
tag = matched[pos]
if not self.stop or tag[0] < self.stop:
return tag
def next(self):
uidx = self.next_unclosed()
mtag = self.next_matched()
if uidx is None and mtag is None:
return
if uidx is None or (mtag is not None and mtag[0] < uidx):
self.scan = mtag[1] + 1
return mtag[0]
elif mtag is None or uidx < mtag[0]:
self.scan = uidx + 1
return uidx
def nest_tags(matched):
'''Create a nested tag data structure from a list of pairs.'''
tags = []
stack = []
for span in matched:
tag = [span]
# Pop tags off of the stack that we're to the right of.
while stack:
top = stack[-1]
if span[0] > top[0][1]:
stack.pop()
else:
top.append(tag)
break
else:
tags.append(tag)
stack.append(tag)
return tags
def listTest(alist):
m = {}
for item in alist:
m[item] = 1
return m.has_key
EMPTY_HTML_TAGS = [
# List of HTML tags with an empty content model; these are
# rendered in minimized form, e.g. <img />.
# From http://www.w3.org/TR/xhtml1/#dtds
"base", "meta", "link", "hr", "br", "param", "img", "area",
"input", "col", "basefont", "isindex", "frame",
]
IS_EMPTY_HTML_TAG = listTest(EMPTY_HTML_TAGS)
PARA_LEVEL_HTML_TAGS = [
# List of HTML elements that close open paragraph-level elements
# and are themselves paragraph-level.
"h1", "h2", "h3", "h4", "h5", "h6", "p",
]
BLOCK_LEVEL_HTML_TAGS = [
# List of HTML tags that denote larger sections than paragraphs.
"blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
"noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
]
IS_PARA_OR_BLOCK_LEVEL = listTest(PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS)
IMPCLOSE_ENCLOSING_MAP = {
# Mapping of implicitly closable HTML elements to enclosing tags
# that are 'willing' to close them.
"p": BLOCK_LEVEL_HTML_TAGS + [""],
"tr": ("table", "thead", "tfoot", "tbody"),
"td": ("tr",),
"th": ("tr",),
"li": ("ul", "ol"),
"dd": ("dl",),
"dt": ("dl",),
}
IMPCLOSE_FOLLOWING_MAP = {
"p": PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS + [""],
"tr": ("tr", "thead", "tfoot", "tbody"),
"td": ("td", "th"),
"th": ("td", "th"),
"li": ("li",),
"dd": ("dd", "dt"),
"dt": ("dd", "dt"),
}
=== Added File Packages/HTMLStructure/__init__.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
'''HTMLStructure package
Contains utilities for parsing, validating, and reformatting HTML
text. These use mxTextTools for parsing, and follow its model of
tagged text spans. Unlike the standard HTMLParser, which is
event-driven, this package is data structure-oriented.
'''