[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - BaseNormalizer.py:1.1.2.1 Setup:1.1.2.1 Normalizer.py:1.1.2.2

Andreas Jung andreas@digicool.com
Sun, 3 Feb 2002 14:09:18 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv626

Modified Files:
      Tag: ajung-textindexng-branch
	Normalizer.py 
Added Files:
      Tag: ajung-textindexng-branch
	BaseNormalizer.py Setup 
Log Message:
added Normalizer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/BaseNormalizer.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################

__doc__ =""" 
Word normalizer. A normalizer takes a word and translates its characters
according to a translation table. The functionality is similar to
string.translate() but allows multiple characters to be translated.
A normalizer is typically used to translate accented characters to ASCII.
"""

from BTrees.OOBTree import OOBTree
from types import ListType, StringType
import normalizer
import re, os

# Directory containing this module; readTable() looks for translation
# tables in its 'normalizers' subdirectory before trying the given path.
_basedir = os.path.dirname(__file__)

class NormalizerException(Exception):
    """Error raised by BaseNormalizer, e.g. for an unsupported
    constructor argument.
    """

class BaseNormalizer:
    """Word normalizer.

    Thin wrapper around normalizer.Normalizer from the 'normalizer'
    extension module.  The translation table is either passed in
    directly or read from a text file by readTable().
    """

    def __init__(self, arg):
        """Create a normalizer from a translation table.

        arg -- either a list that is handed to normalizer.Normalizer
               unchanged (presumably (source, replacement) pairs like
               those readTable() returns -- confirm against the
               extension module), or a string naming a translation
               table file (see readTable()).

        Raises NormalizerException for any other argument type.
        """

        if isinstance(arg, ListType):
            table = arg

        elif isinstance(arg, StringType):
            table = self.readTable(arg)

        else:
            raise NormalizerException(
                'Unknown type for normalizer constructor')

        self._normalizer = normalizer.Normalizer(table)


    def normalize(self, arg):
        """ normalize the string/sequence of strings

        The work is delegated to the wrapped normalizer.Normalizer
        object and its result is returned unchanged.
        """

        return self._normalizer.normalize(arg)

    # Instances are callable: obj(arg) is the same as obj.normalize(arg).
    __call__ = normalize


    def readTable(self, fname):
        """ read a translation table

        fname is first looked up in the 'normalizers' directory next to
        this module; if it cannot be opened there, it is opened as a
        path of its own.

        File format: one mapping per line, consisting of two
        whitespace-separated fields (source and replacement).  Any
        further fields on a line are ignored.  Lines with '#' in the
        first column are comments; blank lines are skipped.

        Returns a list of (source, replacement) tuples in file order.

        Raises IOError if the file can be read from neither location
        and NormalizerException if a line has fewer than two fields.
        """

        try:
            lines = self._readLines(
                        os.path.join(_basedir, 'normalizers', fname))
        except IOError:
            # Not a bundled table: fall back to fname as given.  A
            # failure here is propagated to the caller.
            lines = self._readLines(fname)

        table = []
        lineno = 0

        for line in lines:
            lineno = lineno + 1

            if line.startswith('#'):
                continue

            fields = line.split()

            # Tolerate empty lines instead of failing on fields[0].
            if not fields:
                continue

            if len(fields) < 2:
                raise NormalizerException(
                    'Malformed line %d in translation table %s: %r'
                    % (lineno, fname, line))

            table.append((fields[0], fields[1]))

        return table


    def _readLines(self, path):
        """ return all lines of the file 'path'; the file is always closed """

        f = open(path)
        try:
            return f.readlines()
        finally:
            f.close()



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/Setup ===
*shared*
normalizer src/normalizer.c


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/Normalizer.py 1.1.2.1 => 1.1.2.2 ===
 #
 # Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
-# 
+#
 # This software is subject to the provisions of the Zope Public License,
 # Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
 # FOR A PARTICULAR PURPOSE
-# 
+#
 ##############################################################################
 
-__doc__ =""" 
-Word normalizer. A normalizer takes a word and translates its characters
-according to a translation table. The functionality is similiar to 
-string.translate() but allows to translate multiple characters. 
-A normalizer is typically used to translate accented characters to ASCII.
-"""
+__doc__=""" same as BaseNormalizer but to be used in Zope """
 
-from BTrees.OOBTree import OOBTree
-from types import DictType, StringType
-import re, os
+from Persistence import Persistent
+from BaseNormalizer import BaseNormalizer
 
-_basedir = os.path.dirname(__file__)
-
-
-class Normalizer:
-    """ word normalizer """
-
-    def __init__(self, arg):
-        self.clear()
-
-        if isinstance(arg,DictType):
-            self._trans.update(arg)
-
-        elif isinstance(arg,StringType):
-            self._trans.update (self.readTranslationTable(arg) )
-
-        else: 
-            raise ValueError
-
-        self.keys       = self._trans.keys
-        self.values     = self._trans.values
-        self.items      = self._trans.items
-        self.has_key    = self._trans.has_key
-        self.get        = self._trans.get
-
-    def __len__(self):      return len(self._trans)
-
-
-    def clear(self):
-        self._trans = OOBTree()
-        self._order = []
-
-
-    def normalize(self, word):
-        """ normalize the word using the given translation table. This
-            functionality *MUST* go into a C extension for performance
-            reasons !!!
-        """
-
-        for token in self._order:
-            word = word.replace(token, self._trans[token])
-
-        return word
-
-    __call__ = normalize
-
-
-    def  readTranslationTable(self, fname):
-        """  read a translation table """
-
-        def __ordersort(a,b):    return  cmp(len(a),len(b))
-    
-        d = {}
-
-        try:        
-            f = os.path.join(_basedir,'normalizers',fname) 
-            print f
-            lines = open(f).readlines()
-        except:
-            try: lines = open(fname).readlines()
-            except: raise
-
-        for l in lines: 
-            l      = l.strip()
-            fields = l.split()
-
-            d[fields[0]] = fields[1]
-
-
-        self._order = d.keys()
-        self._order.sort(__ordersort)
-        self._order.reverse()
-
-        return d
 
+class Normalizer(Persistent, BaseNormalizer):
+    pass