[Checkins] SVN: Sandbox/luciano/kirbi/src/kirbi/fetch/ fetch refactoring

Luciano Ramalho luciano at ramalho.org
Mon Jul 30 21:10:03 EDT 2007


Log message for revision 78494:
  fetch refactoring
  

Changed:
  U   Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py
  U   Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_parse.py
  A   Sandbox/luciano/kirbi/src/kirbi/fetch/invalid-request.xml

-=-
Modified: Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py	2007-07-31 01:08:45 UTC (rev 78493)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py	2007-07-31 01:10:02 UTC (rev 78494)
@@ -1,20 +1,105 @@
 #!/usr/bin/env python
+# encoding: utf-8
 
+try:
+    from lxml import etree
+except ImportError:
+    try:
+        import cElementTree as etree
+    except ImportError:
+        try:
+            import elementtree.ElementTree as etree
+        except ImportError:
+            raise ImportError, "Failed to import ElementTree from any known place"
+
 import httplib2
 from urllib import quote
 from StringIO import StringIO
 from time import sleep
 
+
 """
-NOTE: 0333647289 is a valid ISBN which generates a AWS.InvalidParameterValue
-    from Amazon.com with message: "0333647289 is not a valid value for ItemId"
-    The book is Virtual History: Alternatives and Counterfactuals
-    by Niall Ferguson (Editor)
-    Amazon.com does not have it but Amazon.co.uk does and
-    Google query "isbn 0333647289" also found it here:
-    http://www.alibris.com/search/search.cfm?qwork=7055972
+Structure of the AmazonECS XML response:
+
+ItemLookupResponse
+    OperationRequest
+        (...)
+    Items
+        Request
+            IsValid
+            ItemLookupRequest
+                ItemId
+                ResponseGroup
+            (Errors)
+                (Error)
+                    (Code)
+                    (Message)
+        (Item)
+            (ItemAttributes)
+                (Author)
+                (Creator Role=...)
+
+Notes:
+- Errors element occurs when ISBN is non-existent;
+        in that case, Code contains the string "AWS.InvalidParameterValue"
+- Author element is not always present
+- Author element may be duplicated with the same content,
+        except for whitespace; for example: ISBN=0141000511
 """
 
+FIELD_MAP = [
+    # Book schema -> Amazon ECS element
+    ('title', 'Title'),
+    ('isbn13', 'EAN'),
+    ('edition', 'Edition'),
+    ('publisher', 'Publisher'),
+    ('issued', 'PublicationDate'),
+    ]
+
+CREATOR_TAGS = ['Author', 'Creator']
+
+AMAZON_INVALID_PARAM = 'AWS.InvalidParameterValue'
+
+
+def nsPath(ns, path):
+    parts = path.split('/')
+    return '/'.join([ns+part for part in parts])
+
+def parse(xml):
+    tree = etree.parse(xml)
+    raiz = tree.getroot()
+    # get the XML namespace from the root tag
+    ns = raiz.tag.split('}')[0] + '}'
+    request = raiz.find(nsPath(ns,'Items/Request'))
+    error_code = request.findtext(nsPath(ns,'Errors/Error/Code'))
+    if error_code is None:
+        items = raiz.findall(nsPath(ns,'Items/Item'))
+        #TODO: treat multiple Item elements in Items
+        item = items[0].find(ns+'ItemAttributes')
+        book_dic = {}
+        for field, tag in FIELD_MAP:
+            elem = item.find(ns+tag)
+            if elem is not None:
+                book_dic[field] = elem.text
+        creators = []
+        for tag in CREATOR_TAGS:
+            for elem in item.findall(ns+tag):
+                if elem is None: continue
+                role = elem.attrib.get('Role')
+                if role:
+                    creator = '%s (%s)' % (elem.text, role)
+                else:
+                    creator = elem.text
+                creators.append(creator)
+        if creators:
+            book_dic['creators'] = creators
+        return book_dic
+
+    elif error_code == AMAZON_INVALID_PARAM:
+        return None
+    else:
+        raise LookupError, error_code
+
 class AmazonECS(object):
 
     base_url = """http://ecs.amazonaws.com/onca/xml"""
@@ -25,39 +110,52 @@
         if AssociateTag:
             self.base_params['AssociateTag'] = AssociateTag
         self.httpcli = httplib2.Http('.cache')
-                    
+
     def buildURL(self, **kw):
         query = []
         kw.update(self.base_params)
         for key, val in kw.items():
             query.append('%s=%s' % (key,quote(val)))
         return self.base_url + '?' + '&'.join(query)
-        
+
     def getFile(self, url):
         # Amazon.com ECS agreement imposes a limit of one request per second
         sleep(1)
         resp, content = self.httpcli.request(url, 'GET')
-        self.tree = etree.parse(StringIO(content))
         return resp, content
-        
+
     def itemLookup(self,itemId,response='ItemAttributes'):
-        params = {  'Operation':'ItemLookup', 
+        params = {  'Operation':'ItemLookup',
                     'ItemId':itemId,
                     'ResponseGroup':response
                  }
         url = self.buildURL(**params)
         return self.getFile(url)[1]
-        
-if __name__=='__main__':
+
+if __name__ == '__main__':
+    import sys
+    from pprint import pprint
+    xml = file(sys.argv[1])
+    dic = parse(xml)
+    pprint(dic)
+
     from amazon_config import ACCESS_KEY_ID, ASSOCIATE_TAG
-    
+
     ecs = AmazonECS(ACCESS_KEY_ID, ASSOCIATE_TAG)
     alice = '0393048470'
     gof = '0201633612'
     awpr = '0977616630'
     oss = '1565925823'
     dup = '0141000511'
-    print ecs.itemLookup(oss)
-    
-    
-    
+    erro = '1231231239'
+    print ecs.itemLookup(erro)
+
+"""
+NOTE: 0333647289 is a valid ISBN which generates an AWS.InvalidParameterValue
+    from Amazon.com with message: "0333647289 is not a valid value for ItemId"
+    The book is Virtual History: Alternatives and Counterfactuals
+    by Niall Ferguson (Editor)
+    Amazon.com does not have it but Amazon.co.uk does and
+    Google query "isbn 0333647289" also found it here:
+    http://www.alibris.com/search/search.cfm?qwork=7055972
+"""

Modified: Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_parse.py
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_parse.py	2007-07-31 01:08:45 UTC (rev 78493)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_parse.py	2007-07-31 01:10:02 UTC (rev 78494)
@@ -1,75 +1,103 @@
 #!/usr/bin/env python
+# encoding: utf-8
 
-import httplib2
-from urllib import quote
-from lxml import etree
-from StringIO import StringIO
-from time import sleep
+"""
+Structure of the AmazonECS XML response:
 
+ItemLookupResponse
+    OperationRequest
+        (...)
+    Items
+        Request
+            IsValid
+            ItemLookupRequest
+                ItemId
+                ResponseGroup
+            (Errors)
+                (Error)
+                    (Code)
+                    (Message)
+        (Item)
+            (ItemAttributes)
+                (Author)
+                (Creator Role=...)
+
+Notes:
+- Errors element occurs when ISBN is non-existent;
+        in that case, Code contains the string "AWS.InvalidParameterValue"
+- Author element is not always present
+- Author element may be duplicated with the same content,
+        except for whitespace; for example: ISBN=0141000511
 """
-NOTE: 0333647289 is a valid ISBN which generates a AWS.InvalidParameterValue
-    from Amazon.com with message: "0333647289 is not a valid value for ItemId"
-    The book is Virtual History: Alternatives and Counterfactuals
-    by Niall Ferguson (Editor)
-    Amazon.com does not have it but Amazon.co.uk does and
-    Google query "isbn 0333647289" also found it here:
-    http://www.alibris.com/search/search.cfm?qwork=7055972
-"""
 
-class AmazonECS(object):
+try:
+    from lxml import etree
+except ImportError:
+    try:
+        import cElementTree as etree
+    except ImportError:
+        try:
+            import elementtree.ElementTree as etree
+        except ImportError:
+            print "Failed to import ElementTree from any known place"
 
-    xml_namespace = """http://webservices.amazon.com/AWSECommerceService/2005-10-05"""
-    base_url = """http://ecs.amazonaws.com/onca/xml"""
+FIELD_MAP = [
+    # Book schema -> Amazon ECS element
+    ('title', 'Title'),
+    ('isbn13', 'EAN'),
+    ('edition', 'Edition'),
+    ('publisher', 'Publisher'),
+    ('issued', 'PublicationDate'),
+    ]
 
-    def __init__(self, AWSAccessKeyId, AssociateTag=None):
-        self.base_params = { 'Service':'AWSECommerceService',
-                             'AWSAccessKeyId':AWSAccessKeyId, }
-        if AssociateTag:
-            self.base_params['AssociateTag'] = AssociateTag
-        self.httpcli = httplib2.Http('.cache')
-                    
-    def buildURL(self, **kw):
-        query = []
-        kw.update(self.base_params)
-        for key, val in kw.items():
-            query.append('%s=%s' % (key,quote(val)))
-        return self.base_url + '?' + '&'.join(query)
-        
-    def getFile(self, url):
-        # Amazon.com ECS agreement imposes a limit of one request per second
-        sleep(1)
-        resp, content = self.httpcli.request(url, 'GET')
-        self.tree = etree.parse(StringIO(content))
-        return resp, content
-        
-    def buildQPath(path, ns):
-        """build a path with fully qualified tags"""
-        ns = '{%s}' % ns
-        parts = path.split('/')
-        return ns+('/'+ns).join(parts)
+CREATOR_TAGS = ['Author', 'Creator']
 
-    def itemLookup(self,itemId,response='ItemAttributes'):
-        params = {  'Operation':'ItemLookup', 
-                    'ItemId':itemId,
-                    'ResponseGroup':response
-                 }
-        url = self.buildURL(**params)
-        return self.getFile(url)[1]
-        
-    def findAll(self,path):
-        pass            
 
+AMAZON_INVALID_PARAM = 'AWS.InvalidParameterValueXX'
 
-if __name__=='__main__':
-    from amazon_config import ACCESS_KEY_ID, ASSOCIATE_TAG
-    
-    ecs = AmazonECS(ACCESS_KEY_ID, ASSOCIATE_TAG)
-    alice = '0393048470'
-    gof = '0201633612'
-    awpr = '0977616630'
-    oss = '1565925823'
-    dup = '0141000511'
-    print ecs.itemLookup(oss)
-    
-    
-    
+
+def nsPath(ns, path):
+    parts = path.split('/')
+    return '/'.join([ns+part for part in parts])
+
+def parse(xml):
+    tree = etree.parse(xml)
+    raiz = tree.getroot()
+    # get the XML namespace from the root tag
+    ns = raiz.tag.split('}')[0] + '}'
+    request = raiz.find(nsPath(ns,'Items/Request'))
+    error_code = request.findtext(nsPath(ns,'Errors/Error/Code'))
+    if error_code is None:
+        items = raiz.findall(nsPath(ns,'Items/Item'))
+        #TODO: treat multiple Item elements in Items
+        item = items[0].find(ns+'ItemAttributes')
+        book_dic = {}
+        for field, tag in FIELD_MAP:
+            elem = item.find(ns+tag)
+            if elem is not None:
+                book_dic[field] = elem.text
+        creators = []
+        for tag in CREATOR_TAGS:
+            for elem in item.findall(ns+tag):
+                if elem is None: continue
+                role = elem.attrib.get('Role')
+                if role:
+                    creator = '%s (%s)' % (elem.text, role)
+                else:
+                    creator = elem.text
+                creators.append(creator)
+        if creators:
+            book_dic['creators'] = creators
+        return book_dic
+
+    elif error_code == AMAZON_INVALID_PARAM:
+        return None
+    else:
+        raise LookupError, error_code
+
+if __name__ == '__main__':
+    import sys
+    from pprint import pprint
+    xml = file(sys.argv[1])
+    dic = parse(xml)
+    pprint(dic)

Added: Sandbox/luciano/kirbi/src/kirbi/fetch/invalid-request.xml
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/invalid-request.xml	                        (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/invalid-request.xml	2007-07-31 01:10:02 UTC (rev 78494)
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ItemLookupResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05">
+	<OperationRequest>
+		<HTTPHeaders>
+			<Header Name="UserAgent" Value="Python-httplib2/$Rev: 235 $"></Header>
+		</HTTPHeaders>
+		<RequestId>0NV290TFMXVW0Y09CXTA</RequestId>
+		<Arguments>
+			<Argument Name="AssociateTag" Value="circulante-20"></Argument>
+			<Argument Name="ItemId" Value="1231231239"></Argument>
+			<Argument Name="Service" Value="AWSECommerceService"></Argument>
+			<Argument Name="ResponseGroup" Value="ItemAttributes"></Argument>
+			<Argument Name="Operation" Value="ItemLookup"></Argument>
+			<Argument Name="AWSAccessKeyId" Value="13W2MMDG65QJJK9GG402"></Argument>
+		</Arguments>
+		<RequestProcessingTime>0.0118951797485352</RequestProcessingTime>
+	</OperationRequest>
+	<Items>
+		<Request>
+			<IsValid>True</IsValid>
+			<ItemLookupRequest>
+				<ItemId>1231231239</ItemId>
+				<ResponseGroup>ItemAttributes</ResponseGroup>
+			</ItemLookupRequest>
+			<Errors>
+				<Error>
+					<Code>AWS.InvalidParameterValue</Code>
+					<Message>1231231239 is not a valid value for ItemId. Please change this value and retry your request.</Message>
+				</Error>
+			</Errors>
+		</Request>
+	</Items>
+</ItemLookupResponse>



More information about the Checkins mailing list