[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/converters - ppt.py:1.1.2.1

Andreas Jung andreas@digicool.com
Fri, 1 Mar 2002 18:54:05 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/converters
In directory cvs.zope.org:/tmp/cvs-serv12740

Added Files:
      Tag: ajung-textindexng-branch
	ppt.py 
Log Message:
Added PowerPoint converter.


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/converters/ppt.py ===
# PowerPoint converter
#
# $Id: ppt.py,v 1.1.2.1 2002/03/01 23:54:05 andreasjung Exp $

import tempfile, os
from sgmllib import SGMLParser
from Globals import package_home
from Products.PluginIndexes.TextIndexNG.BaseConverter import BaseConverter

# Path of 'wvText.xml' inside the directory returned by
# package_home(globals()).
# NOTE(review): nothing in the visible part of this module reads this name;
# presumably a wv configuration path carried over from a sibling converter --
# confirm before relying on or removing it.
wvConf_file = os.path.join(package_home(globals()), 'wvText.xml')

class _StripTagParser(SGMLParser):
  '''SGML Parser removing any tags and translating HTML entities.'''
 
  from htmlentitydefs import entitydefs
 
  data= None
 
  def handle_data(self,data):
    if self.data is None: self.data=[]
    self.data.append(data)
 
  def __str__(self):
    if self.data is None: return ''
    return ''.join(self.data)


class Converter(BaseConverter):
    """Converter turning Microsoft PowerPoint documents into plain text.

    The document is written to a temporary file, rendered to HTML by the
    external 'pptHtml' command, and the HTML is then reduced to its text
    content with _StripTagParser.
    """

    # MIME types this converter declares for itself.
    content_type = ('application/mspowerpoint', 'application/ms-powerpoint',
                    'application/vnd.ms-powerpoint')
    content_description = "Microsoft PowerPoint"
    # Name of the external command run by convert().
    # NOTE(review): presumably inspected by the converter framework to check
    # that the program is installed -- confirm against BaseConverter.
    depends_on = 'pptHtml'

    def convert(self, doc):
        """Convert PowerPoint document to raw text.

        doc -- content of the PowerPoint file as a (byte) string.

        Returns the text found in the HTML that 'pptHtml' prints for the
        document, with tags removed and HTML entities translated.  The
        command's stderr is discarded, so if it prints nothing (for example
        because it is not installed) the result is the empty string.
        """
        # mkstemp() creates and opens the file atomically; mktemp() only
        # returned a name, leaving a window in which another process could
        # create that file first.
        fd, tmp_name = tempfile.mkstemp()
        try:
            # PowerPoint files are binary: write in binary mode and close
            # the file explicitly so the data is on disk before pptHtml
            # reads it.
            tmp_file = os.fdopen(fd, 'wb')
            try:
                tmp_file.write(doc)
            finally:
                tmp_file.close()

            # The path is quoted so a temp directory containing spaces does
            # not split the shell command line.
            pipe = os.popen('pptHtml "%s" 2> /dev/null' % tmp_name)
            try:
                html = pipe.read()
            finally:
                pipe.close()
        finally:
            # Always remove the temporary file, even if the conversion fails.
            os.remove(tmp_name)

        parser = _StripTagParser()
        parser.feed(html)
        parser.close()
        return str(parser)