[Zope3-checkins] CVS: zopeproducts/xml/dom - printer.py:1.1
Philipp von Weitershausen
philikon@philikon.de
Fri, 20 Jun 2003 19:40:40 -0400
Update of /cvs-repository/zopeproducts/xml/dom
In directory cvs.zope.org:/tmp/cvs-serv13468/dom
Added Files:
printer.py
Log Message:
Ported ParsedXML's PrettyPrinter over, now with interface and unit tests of
course.
=== Added File zopeproducts/xml/dom/printer.py ===
##############################################################################
#
# Copyright (c) 2003 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""
XML DOM printer
$Id: printer.py,v 1.1 2003/06/20 23:40:39 philikon Exp $
"""
import sys
import re
import string
from StringIO import StringIO
from zope.interface import implements
from zopeproducts.xml.interfaces.dom.printer import IPrinter
from core import Node, XMLNS_NS, XML_NS
class Printer:
"""
A XML printer which can do optional pretty printing
"""
implements(IPrinter)
def __init__(self, encoding=None, html=False, contentType=None,
entityReferenceExpansion=True, prettyPrint=False,
indentLevel=2):
self.namePrint = lambda s: s # identity
if contentType and html:
if contentType == 'html':
self.namePrint = string.upper
elif contentType == 'xml':
self.namePrint = string.lower
self.encoding = encoding
self.html = html
self.contentType = contentType
self.entityReferenceExpansion = entityReferenceExpansion
self.prettyPrint = prettyPrint
self.indent = 0
self.indentLevel = indentLevel
self.nodeType2method = {
Node.ELEMENT_NODE: self.renderElement,
Node.ATTRIBUTE_NODE: self.renderAttr,
Node.TEXT_NODE: self.renderText,
Node.CDATA_SECTION_NODE: self.renderCDATASection,
Node.ENTITY_REFERENCE_NODE: self.renderEntityReference,
Node.ENTITY_NODE: self.renderEntity,
Node.PROCESSING_INSTRUCTION_NODE: \
self.renderProcessingInstruction,
Node.COMMENT_NODE: self.renderComment,
Node.DOCUMENT_NODE: self.renderDocument,
Node.DOCUMENT_TYPE_NODE: self.renderDocumentType,
Node.DOCUMENT_FRAGMENT_NODE: self.renderDocumentFragment,
Node.NOTATION_NODE: self.renderNotation,
}
def render(self, stream, node):
self.nodeType2method[node.nodeType](stream, node)
def renderElement(self, f, node):
if self.prettyPrint:
f.write(" " * self.indent * self.indentLevel)
f.write("<")
f.write(self.namePrint(node.tagName))
for attribute in node.attributes.values():
self.renderAttr(f, attribute)
if not node.hasChildNodes():
if self.html:
if node.tagName.upper() not in HTML_FORBIDDEN_END:
f.write('></')
f.write(self.namePrint(node.tagName))
f.write('>')
else:
f.write(' />')
else:
f.write('/>')
if self.prettyPrint:
f.write("\n")
else:
f.write('>')
prettyPrint = self.prettyPrint
stream = f
if prettyPrint:
f.write("\n")
no_indentation = False
for child in node.childNodes:
if (child.nodeType == Node.TEXT_NODE and
child.data.strip() != ''):
no_indentation = True
break
if no_indentation:
stream = StringIO()
self.prettyPrint = False
self.indent += 1
for child in node.childNodes:
self.render(stream, child)
self.prettyPrint = prettyPrint
if prettyPrint:
if no_indentation:
f.write(indentBlock(
stream.getvalue().strip(),
self.indent * self.indentLevel, 70))
f.write('\n')
self.indent -= 1
f.write(" " * self.indent * self.indentLevel)
f.write("</%s>" % self.namePrint(node.tagName))
if self.prettyPrint:
f.write("\n")
def renderAttr(self, f, node):
if not node.specified:
return
text, delimiter = _translateCdataAttr(node.value,
encoding=self.encoding)
f.write(" %s=%s%s%s" % (self.namePrint(node.name),
delimiter, text, delimiter))
def renderText(self, f, node):
data = node.data
if self.prettyPrint:
data = node.data.strip()
if data == "":
return
data = indentBlock(data, self.indent * self.indentLevel, 70)
f.write(_translateCdata(data, self.encoding))
if self.prettyPrint:
f.write('\n')
def renderCDATASection(self, f, node):
f.write("<![CDATA[")
f.write(node.data.replace("]]>", "]]]><![CDATA[]>"))
f.write("]]>")
def renderEntityReference(self, f, node):
f.write('&')
f.write(node.nodeName)
f.write(';')
def renderEntity(self, f, node):
st = "<!ENTITY " + node.nodeName
if not node.systemId:
# internal entity
s = node.firstChild.data
st = '%s "%s"' % (st, _translateCdata(s, self.encoding))
if node.publicId:
st = st + ' PUBLIC "%s"' % node.publicId
if node.systemId:
st = '%s "%s"' % (st, node.systemId)
elif node.systemId:
st = st + ' SYSTEM "%s"' % node.systemId
if node.notationName:
st = st + ' NDATA %s' % node.notationName
f.write(st + '>\n')
def renderProcessingInstruction(self, f, node):
f.write('<?')
f.write(node.target + ' ')
f.write(node.data)
f.write('?>')
def renderComment(self, f, node):
f.write('<!--')
f.write(node.data)
f.write('-->')
def renderDocument(self, f, node):
if not self.html:
f.write('<?xml version="1.0"')
if self.encoding:
f.write(' encoding="%s"' % self.encoding)
f.write(' ?>\n')
for child in node.childNodes:
self.render(f, child)
f.write('\n')
def renderDocumentType(self, f, node):
if (not node.entities.length and
not node.notations.length and
not node.systemId):
return
f.write("<!DOCTYPE ")
f.write(node.name)
if node.systemId:
if node.publicId:
if '"' not in node.publicId:
f.write(' PUBLIC "' + node.publicId + '" ')
else:
f.write(" PUBLIC '" + node.publicId + "' ")
else:
f.write(' SYSTEM ')
if '"' not in node.systemId:
f.write('"' + node.systemId + '"')
else:
f.write("'" + node.systemId + "'")
if node.internalSubset:
f.write(" [%s]" % node.internalSubset)
elif node.entities.length or node.notations.length:
f.write(' [\n')
for i in range(node.entities.length):
self.render(f, node.entities.item(i))
for i in range(node.notations.length):
self.render(f, node.notations.item(i))
f.write(']')
f.write('>\n')
def renderNotation(self, f, node):
st = "<!NOTATION %s" % node.nodeName
if node.publicId:
st = st + ' PUBLIC "%s"' % node.publicId
if node.systemId:
st = '%s "%s"' % (st, node.systemId)
elif node.systemId:
st = st + ' SYSTEM "%s"' % node.systemId
f.write(st + '>\n')
def renderDocumentFragment(self, f, node):
for child in node.childNodes:
self.render(f, child)
# regexps used by _translateCdata(),
# made global to compile once.
# see http://www.xml.com/axml/target.html#dt-character
ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
SURROGATE_BLOCK = '[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]'
ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
# Note: Prolly fuzzy on this, but it looks as if characters from the
# surrogate block are allowed if in scalar form, which is encoded in UTF8 the
# same was as in surrogate block form
XML_ILLEGAL_CHAR_PATTERN = re.compile(
'%s|%s' % (ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))
# the characters that we will want to turn into entrefs
# We must do so for &, <, and > following ]].
# The xml parser has more leeway, but we're not the parser.
# http://www.xml.com/axml/target.html#dt-chardata
# characters that we must *always* turn to entrefs:
g_cdataCharPatternReq = re.compile('[&<]|]]>')
g_charToEntityReq = {
'&': '&',
'<': '<',
']]>': ']]>',
}
# characters that we must turn to entrefs in attr values:
g_cdataCharPattern = re.compile('[&<>"\']|]]>')
g_charToEntity = {
'&': '&',
'<': '<',
'>': '>',
'"': '"',
"'": ''',
']]>': ']]>',
}
# HTML nodes to always be minimzied, else never minimize
# from PyXML's xml.dom.html
# http://www.w3.org/TR/xhtml1/#guidelines
HTML_FORBIDDEN_END = ['AREA', 'BASE', 'BASEFONT', 'BR', 'COL', 'FRAME',
'HR', 'IMG', 'INPUT', 'ISINDEX', 'LINK', 'META', 'PARAM']
def _translateCdata(characters, allEntRefs = None, encoding='UTF-8'):
"""Translate characters into a legal format."""
if not characters:
return ''
if allEntRefs: # translate all chars to entrefs; for attr value
if g_cdataCharPattern.search(characters):
new_string = g_cdataCharPattern.subn(
lambda m, d=g_charToEntity: d[m.group()],
characters)[0]
else:
new_string = characters
else: # translate only required chars to entrefs
if g_cdataCharPatternReq.search(characters):
new_string = g_cdataCharPatternReq.subn(
lambda m, d=g_charToEntityReq: d[m.group()],
characters)[0]
else:
new_string = characters
if XML_ILLEGAL_CHAR_PATTERN.search(new_string):
new_string = XML_ILLEGAL_CHAR_PATTERN.subn(
lambda m: '&#%i;' % ord(m.group()),
new_string)[0]
#new_string = utf8_to_code(new_string, encoding) # XXX ugh
return new_string
def _translateCdataAttr(characters, encoding='UTF-8'):
"""
Translate attribute value characters into a legal format;
return the value and the delimiter used.
"""
if not characters:
return '', '"'
if '"' not in characters or "'" in characters:
delimiter = '"'
new_chars = _translateCdata(characters, allEntRefs=True,
encoding=encoding)
new_chars = re.sub("'", "'", new_chars)
else:
delimiter = "'"
new_chars = _translateCdata(characters, allEntRefs=True,
encoding=encoding)
new_chars = re.sub(""", '"', new_chars)
return new_chars, delimiter
def indentBlock(text, indent, line_length):
words = text.split()
lines = []
i = 0
while i < len(words):
line = []
while i < len(words) and indent + len(" ".join(line)) < line_length:
line.append(words[i])
i += 1
if len(line) > 1 and indent + len(" ".join(line)) >= line_length:
i -= 1
line.pop()
lines.append(" " * indent + " ".join(line))
return '\n'.join(lines)