[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/converters - ppt.py:1.1.2.1
Andreas Jung
andreas@digicool.com
Fri, 1 Mar 2002 18:54:05 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/converters
In directory cvs.zope.org:/tmp/cvs-serv12740
Added Files:
Tag: ajung-textindexng-branch
ppt.py
Log Message:
added powerpoint converter
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/converters/ppt.py ===
# PowerPoint converter
#
# $Id: ppt.py,v 1.1.2.1 2002/03/01 23:54:05 andreasjung Exp $
import tempfile, os
from sgmllib import SGMLParser
from Globals import package_home
from Products.PluginIndexes.TextIndexNG.BaseConverter import BaseConverter
# Path built by joining 'wvText.xml' onto whatever package_home() returns for
# this module's globals.
# NOTE(review): presumably a configuration file for the wv tools; none of the
# code visible in this module reads this name -- confirm it is still needed.
wvConf_file = os.path.join(package_home(globals()), 'wvText.xml')
class _StripTagParser(SGMLParser):
'''SGML Parser removing any tags and translating HTML entities.'''
from htmlentitydefs import entitydefs
data= None
def handle_data(self,data):
if self.data is None: self.data=[]
self.data.append(data)
def __str__(self):
if self.data is None: return ''
return ''.join(self.data)
class Converter(BaseConverter):
    """Converter turning Microsoft PowerPoint documents into plain text.

    The document is written to a temporary file, the external ``pptHtml``
    program is run on it through the shell, and the HTML it prints is
    reduced to its text content with _StripTagParser.
    """

    # MIME types this converter declares itself responsible for.
    content_type = ('application/mspowerpoint', 'application/ms-powerpoint',
                    'application/vnd.ms-powerpoint')
    content_description = "Microsoft PowerPoint"
    # Name of the external program invoked by convert().
    # NOTE(review): presumably inspected by the converter framework to check
    # availability -- confirm against BaseConverter.
    depends_on = 'pptHtml'

    def convert(self, doc):
        """Convert PowerPoint document to raw text.

        doc -- the raw bytes of the PowerPoint file.

        Returns the text extracted from the HTML emitted by ``pptHtml``;
        an empty string if the program produced no text.  Errors raised
        while writing the temporary file or reading from the pipe
        propagate to the caller; the temporary file is removed either way.
        """
        # mkstemp() creates the file atomically, avoiding the name race that
        # tempfile.mktemp() is open to.
        fd, tmp_name = tempfile.mkstemp()
        try:
            # Binary mode: PowerPoint data must not go through newline
            # translation.
            tmp_file = os.fdopen(fd, 'wb')
            try:
                tmp_file.write(doc)
            finally:
                # Close (and thereby flush) before the external program
                # reads the file.
                tmp_file.close()

            pipe = os.popen('pptHtml %s 2> /dev/null' % (tmp_name))
            try:
                text = pipe.read()
            finally:
                pipe.close()
        finally:
            # Always clean up, even when the conversion step failed.
            os.remove(tmp_name)

        p = _StripTagParser()
        p.feed(text)
        p.close()
        return str(p)