[Checkins] SVN: Products.CMFCore/trunk/Products/CMFCore/ - FSPageTemplate: Change the charset/encoding detection to consider

Jens Vagelpohl jens at dataflake.org
Sun May 24 04:49:42 EDT 2009


Log message for revision 100323:
  - FSPageTemplate: Change the charset/encoding detection to consider
    charset specifications in the content type, and replace the
    hardcoded ISO-8859-15 (Latin-9) fallback with the mechanism used by the
    Products.PageTemplates code, which can be influenced by setting
    the environment variable "ZPT_PREFERRED_ENCODING"
    (https://bugs.launchpad.net/zope-cmf/+bug/322263)
  

Changed:
  U   Products.CMFCore/trunk/Products/CMFCore/CHANGES.txt
  U   Products.CMFCore/trunk/Products/CMFCore/FSPageTemplate.py
  A   Products.CMFCore/trunk/Products/CMFCore/tests/fake_skins/fake_skin/testPT5.pt
  A   Products.CMFCore/trunk/Products/CMFCore/tests/fake_skins/fake_skin/testPT5.pt.metadata
  U   Products.CMFCore/trunk/Products/CMFCore/tests/test_FSPageTemplate.py

-=-
Modified: Products.CMFCore/trunk/Products/CMFCore/CHANGES.txt
===================================================================
--- Products.CMFCore/trunk/Products/CMFCore/CHANGES.txt	2009-05-24 08:36:22 UTC (rev 100322)
+++ Products.CMFCore/trunk/Products/CMFCore/CHANGES.txt	2009-05-24 08:49:42 UTC (rev 100323)
@@ -4,6 +4,13 @@
 2.2.0 (unreleased)
 ------------------
 
+- FSPageTemplate: Change the charset/encoding detection to consider 
+  charset specifications in the content type, and replace the
+  hardcoded ISO-8859-15 (Latin-9) fallback with the mechanism used by the
+  Products.PageTemplates code, which can be influenced by setting
+  the environment variable "ZPT_PREFERRED_ENCODING"
+  (https://bugs.launchpad.net/zope-cmf/+bug/322263)
+
 - Expose the ZMI manage view of the `_components ` object manager as
   a new "Components Folder" tab in the ZMI.
 

Modified: Products.CMFCore/trunk/Products/CMFCore/FSPageTemplate.py
===================================================================
--- Products.CMFCore/trunk/Products/CMFCore/FSPageTemplate.py	2009-05-24 08:36:22 UTC (rev 100322)
+++ Products.CMFCore/trunk/Products/CMFCore/FSPageTemplate.py	2009-05-24 08:49:42 UTC (rev 100323)
@@ -22,10 +22,11 @@
 from App.class_init import InitializeClass
 from App.special_dtml import DTMLFile
 from Products.PageTemplates.PageTemplate import PageTemplate
+from Products.PageTemplates.utils import charsetFromMetaEquiv
 from Products.PageTemplates.utils import encodingFromXMLPreamble
-from Products.PageTemplates.utils import charsetFromMetaEquiv
+from Products.PageTemplates.ZopePageTemplate import preferred_encodings
+from Products.PageTemplates.ZopePageTemplate import Src
 from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
-from Products.PageTemplates.ZopePageTemplate import Src
 from Shared.DC.Scripts.Script import Script
 
 from Products.CMFCore.DirectoryView import registerFileExtension
@@ -39,8 +40,8 @@
 from Products.CMFCore.utils import _setCacheHeaders
 
 
-
 xml_detect_re = re.compile('^\s*<\?xml\s+(?:[^>]*?encoding=["\']([^"\'>]+))?')
+charset_re = re.compile(r'charset.*?=.*?(?P<charset>[\w\-]*)', re.I|re.M|re.S)
 _marker = object()
 
 
@@ -94,8 +95,10 @@
             # type is initialized as text/html by default, so we only
             # attempt further detection if the default is encountered.
             # One previous misbehavior remains: It is not possible to
-            # force a text./html type if parsing detects it as XML.
+            # force a text/html type if parsing detects it as XML.
             encoding = None
+            preferred = preferred_encodings[:]
+
             if getattr(self, 'content_type', 'text/html') == 'text/html':
                 xml_info = xml_detect_re.match(data)
                 if xml_info:
@@ -104,24 +107,42 @@
                     encoding = xml_info.group(1) or 'utf-8'
                     self.content_type = 'text/xml; charset=%s' % encoding
 
+            if not isinstance(data, unicode):
+                if encoding is None:
+                    charset = getattr(self, 'charset', None)
 
-            if encoding is None:
-                charset = getattr(self, 'charset', None)
-                if charset is None:
-                    if self.content_type.startswith('text/html'):
-                        charset = charsetFromMetaEquiv(data) or 'iso-8859-15'
-                    elif self.content_type.startswith('text/xml'):
-                        charset = encodingFromXMLPreamble(data)
-                    else:
-                        raise ValueError('Unsupported content-type: %s'
-                                            % self.content_type)
+                    if charset is None:
+                        if self.content_type.startswith('text/html'):
+                            mo = charset_re.search(self.content_type)
+                            if mo:
+                                charset = mo.group(1).lower()
 
-                if not isinstance(data, unicode):
-                    data = unicode(data, charset)
-            else:
-                if not isinstance(data, unicode):
-                    data = unicode(data, encoding)
+                            if charset is None:
+                                charset = charsetFromMetaEquiv(data)
+                                
+                        elif self.content_type.startswith('text/xml'):
+                            charset = encodingFromXMLPreamble(data)
 
+                        else:
+                            raise ValueError('Unsupported content_type: %s'
+                                                % self.content_type)
+
+                    if charset is not None:
+                        preferred.insert(0, charset)
+
+                else:
+                    preferred.insert(0, encoding)
+
+                for enc in preferred:
+                    try:
+                        data = unicode(data, enc)
+                        if isinstance(data, unicode):
+                            break
+                    except UnicodeDecodeError:
+                            continue
+                else:
+                    data = unicode(data)
+
             self.write(data)
 
 

Added: Products.CMFCore/trunk/Products/CMFCore/tests/fake_skins/fake_skin/testPT5.pt
===================================================================
(Binary files differ)


Property changes on: Products.CMFCore/trunk/Products/CMFCore/tests/fake_skins/fake_skin/testPT5.pt
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Added: Products.CMFCore/trunk/Products/CMFCore/tests/fake_skins/fake_skin/testPT5.pt.metadata
===================================================================
--- Products.CMFCore/trunk/Products/CMFCore/tests/fake_skins/fake_skin/testPT5.pt.metadata	                        (rev 0)
+++ Products.CMFCore/trunk/Products/CMFCore/tests/fake_skins/fake_skin/testPT5.pt.metadata	2009-05-24 08:49:42 UTC (rev 100323)
@@ -0,0 +1,2 @@
+[default]
+content_type=text/html; charset=utf-16

Modified: Products.CMFCore/trunk/Products/CMFCore/tests/test_FSPageTemplate.py
===================================================================
--- Products.CMFCore/trunk/Products/CMFCore/tests/test_FSPageTemplate.py	2009-05-24 08:36:22 UTC (rev 100322)
+++ Products.CMFCore/trunk/Products/CMFCore/tests/test_FSPageTemplate.py	2009-05-24 08:49:42 UTC (rev 100323)
@@ -96,6 +96,7 @@
         self.assertEqual( self.RESPONSE.getHeader('content-type')
                         , 'text/xml'
                         )
+
     def test_CharsetFromFSMetadata(self):
         # testPT3 is an UTF-16 encoded file (see its .metadatafile)
         # is respected
@@ -114,6 +115,13 @@
         self.failUnless(u'123üöäß' in data)
         self.assertEqual(script.content_type, 'text/html')
 
+    def test_CharsetFromContentTypeMetadata(self):
+        script = self._makeOne('testPT5', 'testPT5.pt')
+        script = script.__of__(self.root)
+        data = script.read()
+        self.failUnless(u'123üöäß' in data)
+        self.assertEqual(script.content_type, 'text/html; charset=utf-16')
+
     def test_BadCall( self ):
         script = self._makeOne( 'testPTbad', 'testPTbad.pt' )
         script = script.__of__(self.root)



More information about the Checkins mailing list