[Checkins] SVN: Sandbox/luciano/kirbi/src/kirbi/fetch/ continue work on amazon.com data fetch

Luciano Ramalho luciano at ramalho.org
Mon Jul 9 22:29:03 EDT 2007


Log message for revision 77669:
  continue work on amazon.com data fetch
  

Changed:
  U   Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py
  A   Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml
  A   Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml
  A   Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt
  A   Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml
  A   Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py

-=-
Modified: Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py	2007-07-09 20:32:11 UTC (rev 77668)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py	2007-07-10 02:29:00 UTC (rev 77669)
@@ -4,7 +4,18 @@
 from urllib import quote
 from lxml import etree
 from StringIO import StringIO
+from time import sleep
 
+"""
+NOTE: 0333647289 is a valid ISBN which generates an AWS.InvalidParameterValue
+    from Amazon.com with message: "0333647289 is not a valid value for ItemId"
+    The book is Virtual History: Alternatives and Counterfactuals
+    by Niall Ferguson (Editor)
+    Amazon.com does not have it but Amazon.co.uk does and
+    Google query "isbn 0333647289" also found it here:
+    http://www.alibris.com/search/search.cfm?qwork=7055972
+"""
+
 class AmazonECS(object):
 
     xml_namespace = """http://webservices.amazon.com/AWSECommerceService/2005-10-05"""
@@ -24,7 +35,9 @@
             query.append('%s=%s' % (key,quote(val)))
         return self.base_url + '?' + '&'.join(query)
         
-    def fetchTree(self, url):
+    def getFile(self, url):
+        # Amazon.com ECS agreement imposes a limit of one request per second
+        sleep(1)
         resp, content = self.httpcli.request(url, 'GET')
         self.tree = etree.parse(StringIO(content))
         return resp, content
@@ -35,32 +48,18 @@
         parts = path.split('/')
         return ns+('/'+ns).join(parts)
 
-    def itemLookup(self,itemId):
-        params = {'Operation':'ItemLookup', 'ItemId':itemId}
+    def itemLookup(self,itemId,response='ItemAttributes'):
+        params = {  'Operation':'ItemLookup', 
+                    'ItemId':itemId,
+                    'ResponseGroup':response
+                 }
         url = self.buildURL(**params)
-        return self.fetchTree(url)
+        return self.getFile(url)[1]
         
     def findAll(self,path):
-        pass
+        pass            
 
 
-def fetch(asin):
-    params['asin'] = asin
-    params['op'] = 'ItemLookup'
-    print asin
-    resp, content = h.request(URL % params, 'GET')
-    tree = etree.parse(StringIO(content))
-    # the tree root is the toplevel html element
-    items = tree.findall(qPath('Items/Item/ItemAttributes',NS))
-    for item in items:
-        print item.find(qPath('Title',NS)).text
-        for author in item.findall(qPath('Author',NS)):
-            print 'author: ', author.text
-        for creator in item.findall(qPath('Creator',NS)):
-            print 'creator: ', creator.text
-            
-
-
 if __name__=='__main__':
     from amazon_config import ACCESS_KEY_ID, ASSOCIATE_TAG
     
@@ -69,7 +68,8 @@
     gof = '0201633612'
     awpr = '0977616630'
     oss = '1565925823'
-    print ecs.itemLookup(alice)
+    dup = '0141000511'
+    print ecs.itemLookup(oss)
     
     
     

Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml	                        (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml	2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ItemLookupResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05">
+	<OperationRequest>
+		<HTTPHeaders>
+			<Header Name="UserAgent" Value="Python-httplib2/$Rev: 235 $"></Header>
+		</HTTPHeaders>
+		<RequestId>1MTBTZB8ENGD2XG0F35E</RequestId>
+		<Arguments>
+			<Argument Name="AssociateTag" Value="circulante-20"></Argument>
+			<Argument Name="ItemId" Value="0393048470"></Argument>
+			<Argument Name="Service" Value="AWSECommerceService"></Argument>
+			<Argument Name="ResponseGroup" Value="ItemAttributes"></Argument>
+			<Argument Name="Operation" Value="ItemLookup"></Argument>
+			<Argument Name="AWSAccessKeyId" Value="13W2MMDG65QJJK9GG402"></Argument>
+		</Arguments>
+		<RequestProcessingTime>0.0305209159851074</RequestProcessingTime>
+	</OperationRequest>
+	<Items>
+		<Request>
+			<IsValid>True</IsValid>
+			<ItemLookupRequest>
+			<ItemId>0393048470</ItemId>
+			<ResponseGroup>ItemAttributes</ResponseGroup>
+			</ItemLookupRequest>
+		</Request>
+		<Item>
+			<ASIN>0393048470</ASIN>
+			<DetailPageURL>http://www.amazon.com/gp/redirect.html%3FASIN=0393048470%26tag=circulante-20%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0393048470%253FSubscriptionId=13W2MMDG65QJJK9GG402</DetailPageURL>
+			<ItemAttributes>
+				<Author>Lewis Carroll</Author>
+				<Binding>Hardcover</Binding>
+				<Brand>W.W. Norton &amp; Company</Brand>
+				<Creator Role="Editor">Martin Gardner</Creator>
+				<Creator Role="Illustrator">John Tenniel</Creator>
+				<DeweyDecimalNumber>823.8</DeweyDecimalNumber>
+				<EAN>9780393048476</EAN>
+				<Edition>Upd Sub</Edition>
+				<ISBN>0393048470</ISBN>
+				<Label>W. W. Norton &amp; Company</Label>
+				<ListPrice>
+				<Amount>2995</Amount>
+				<CurrencyCode>USD</CurrencyCode>
+				<FormattedPrice>$29.95</FormattedPrice>
+				</ListPrice>
+				<Manufacturer>W. W. Norton &amp; Company</Manufacturer>
+				<NumberOfItems>1</NumberOfItems>
+				<NumberOfPages>312</NumberOfPages>
+				<PackageDimensions>
+				<Height Units="hundredths-inches">112</Height>
+				<Length Units="hundredths-inches">1023</Length>
+				<Weight Units="hundredths-pounds">246</Weight>
+				<Width Units="hundredths-inches">875</Width>
+				</PackageDimensions>
+				<ProductGroup>Book</ProductGroup>
+				<PublicationDate>1999-11</PublicationDate>
+				<Publisher>W. W. Norton &amp; Company</Publisher>
+				<Studio>W. W. Norton &amp; Company</Studio>
+				<Title>The Annotated Alice: The Definitive Edition</Title>
+			</ItemAttributes>
+		</Item>
+	</Items>
+</ItemLookupResponse>

Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml	                        (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml	2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ItemLookupResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05">
+<OperationRequest>
+<HTTPHeaders>
+<Header Name="UserAgent" Value="Python-httplib2/$Rev: 235 $">
+</Header>
+</HTTPHeaders>
+<RequestId>078382479W70DGS4GWCS</RequestId>
+<Arguments>
+<Argument Name="AssociateTag" Value="circulante-20">
+</Argument>
+<Argument Name="ItemId" Value="0141000511">
+</Argument>
+<Argument Name="Service" Value="AWSECommerceService">
+</Argument>
+<Argument Name="ResponseGroup" Value="ItemAttributes">
+</Argument>
+<Argument Name="Operation" Value="ItemLookup">
+</Argument>
+<Argument Name="AWSAccessKeyId" Value="13W2MMDG65QJJK9GG402">
+</Argument>
+</Arguments>
+<RequestProcessingTime>0.0172529220581055</RequestProcessingTime>
+</OperationRequest>
+<Items>
+<Request>
+<IsValid>True</IsValid>
+<ItemLookupRequest>
+<ItemId>0141000511</ItemId>
+<ResponseGroup>ItemAttributes</ResponseGroup>
+</ItemLookupRequest>
+</Request>
+<Item>
+<ASIN>0141000511</ASIN>
+<DetailPageURL>http://www.amazon.com/gp/redirect.html%3FASIN=0141000511%26tag=circulante-20%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0141000511%253FSubscriptionId=13W2MMDG65QJJK9GG402</DetailPageURL>
+<ItemAttributes>
+<Author>Steven  Levy</Author>
+<Author>Steven Levy</Author>
+<Binding>Paperback</Binding>
+<Brand>Penguin Non-Classics</Brand>
+<DeweyDecimalNumber>005</DeweyDecimalNumber>
+<EAN>9780141000510</EAN>
+<Edition>Updated</Edition>
+<ISBN>0141000511</ISBN>
+<Label>Penguin (Non-Classics)</Label>
+<ListPrice>
+<Amount>1600</Amount>
+<CurrencyCode>USD</CurrencyCode>
+<FormattedPrice>$16.00</FormattedPrice>
+</ListPrice>
+<Manufacturer>Penguin (Non-Classics)</Manufacturer>
+<NumberOfItems>1</NumberOfItems>
+<NumberOfPages>464</NumberOfPages>
+<PackageDimensions>
+<Height Units="hundredths-inches">106</Height>
+<Length Units="hundredths-inches">788</Length>
+<Weight Units="hundredths-pounds">85</Weight>
+<Width Units="hundredths-inches">528</Width>
+</PackageDimensions>
+<ProductGroup>Book</ProductGroup>
+<PublicationDate>2001-01</PublicationDate>
+<Publisher>Penguin (Non-Classics)</Publisher>
+<ReleaseDate>2001-01-02</ReleaseDate>
+<Studio>Penguin (Non-Classics)</Studio>
+<Title>Hackers: Heroes of the Computer Revolution</Title>
+</ItemAttributes>
+</Item>
+</Items>
+</ItemLookupResponse>

Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt	                        (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt	2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,36 @@
+Author	Chris DiBona
+Author	Sam Ockman
+Author	Mark Stone
+Author	Brian Behlendorf
+Author	Scott Bradner
+Author	Jim Hamerly
+Author	Kirk McKusick
+Author	Tim O'Reilly
+Author	Tom Paquin
+Author	Bruce Perens
+Author	Eric Raymond
+Author	Richard Stallman
+Author	Michael Tiemann
+Author	Linus Torvalds
+Author	Paul Vixie
+Author	Larry Wall
+Author	Bob Young
+Binding	Paperback
+Brand	O'Reilly Media
+DeweyDecimalNumber	5.1068
+EAN	9781565925823
+Edition	1
+Format	Illustrated
+ISBN	1565925823
+Label	O'Reilly Media, Inc.
+ListPrice	
+Manufacturer	O'Reilly Media, Inc.
+NumberOfItems	1
+NumberOfPages	280
+PackageDimensions	
+ProductGroup	Book
+PublicationDate	1999-01
+Publisher	O'Reilly Media, Inc.
+Studio	O'Reilly Media, Inc.
+Title	Open Sources: Voices from the Open Source Revolution (O'Reilly Open Source)
+UPC	636920925828

Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml	                        (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml	2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8"?><ItemLookupResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"><OperationRequest><HTTPHeaders><Header Name="UserAgent" Value="Python-httplib2/$Rev: 235 $"></Header></HTTPHeaders><RequestId>1S4RCZT6BWKTCPN573E0</RequestId><Arguments><Argument Name="AssociateTag" Value="circulante-20"></Argument><Argument Name="ItemId" Value="1565925823"></Argument><Argument Name="Service" Value="AWSECommerceService"></Argument><Argument Name="ResponseGroup" Value="ItemAttributes"></Argument><Argument Name="Operation" Value="ItemLookup"></Argument><Argument Name="AWSAccessKeyId" Value="13W2MMDG65QJJK9GG402"></Argument></Arguments><RequestProcessingTime>0.0315189361572266</RequestProcessingTime></OperationRequest><Items><Request><IsValid>True</IsValid><ItemLookupRequest><ItemId>1565925823</ItemId><ResponseGroup>ItemAttributes</ResponseGroup></ItemLookupRequest></Request><Item><ASIN>1565925823</ASIN><DetailPageURL>http://www.amazon.com/gp/redirect.html%3FASIN=1565925823%26tag=circulante-20%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/1565925823%253FSubscriptionId=13W2MMDG65QJJK9GG402</DetailPageURL><ItemAttributes><Author>Chris DiBona</Author><Author>Sam Ockman</Author><Author>Mark Stone</Author><Author>Brian Behlendorf</Author><Author>Scott Bradner</Author><Author>Jim Hamerly</Author><Author>Kirk McKusick</Author><Author>Tim O'Reilly</Author><Author>Tom Paquin</Author><Author>Bruce Perens</Author><Author>Eric Raymond</Author><Author>Richard Stallman</Author><Author>Michael Tiemann</Author><Author>Linus Torvalds</Author><Author>Paul Vixie</Author><Author>Larry Wall</Author><Author>Bob Young</Author><Binding>Paperback</Binding><Brand>O'Reilly Media</Brand><DeweyDecimalNumber>005.1068</DeweyDecimalNumber><EAN>9781565925823</EAN><Edition>1</Edition><Format>Illustrated</Format><ISBN>1565925823</ISBN><Label>O'Reilly Media, 
Inc.</Label><ListPrice><Amount>2495</Amount><CurrencyCode>USD</CurrencyCode><FormattedPrice>$24.95</FormattedPrice></ListPrice><Manufacturer>O'Reilly Media, Inc.</Manufacturer><NumberOfItems>1</NumberOfItems><NumberOfPages>280</NumberOfPages><PackageDimensions><Height Units="hundredths-inches">69</Height><Length Units="hundredths-inches">916</Length><Weight Units="hundredths-pounds">106</Weight><Width Units="hundredths-inches">704</Width></PackageDimensions><ProductGroup>Book</ProductGroup><PublicationDate>1999-01</PublicationDate><Publisher>O'Reilly Media, Inc.</Publisher><Studio>O'Reilly Media, Inc.</Studio><Title>Open Sources: Voices from the Open Source Revolution (O'Reilly Open Source)</Title><UPC>636920925828</UPC></ItemAttributes></Item></Items></ItemLookupResponse>

Added: Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py	                        (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py	2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+lxml_test.py
+
+Groo notes:
+fixed another Amazon corner case: sometimes the Author element is duplicated
+with the same content! For example: ISBN=0141000511
+fix to handle Amazon corner-case: sometimes they don't have an Author
+element
+
+"""
+from lxml import etree, objectify
+from StringIO import StringIO
+
+from IPython.Shell import IPShellEmbed
+ipshell = IPShellEmbed()
+# ipshell() # this call anywhere in your program will start IPython
+
+def main():
+    xml = file('item-oss.xml')
+    
+    parser = etree.XMLParser(remove_blank_text=True)
+    lookup = objectify.ObjectifyElementClassLookup()
+    parser.setElementClassLookup(lookup)
+    tree = etree.parse(xml, parser)
+    #ipshell()
+    raiz = tree.getroot()
+    assert len(raiz.Items.Item) == 1
+    for attr in raiz.Items.Item.ItemAttributes.getchildren():
+        tag = attr.tag[attr.tag.find('}')+1:]
+        print '%s\t%s' % (tag, attr),
+        if tag == 'Creator':
+            print '(%s)' % attr.get('Role')
+        else:
+            print
+    
+
+
+if __name__ == '__main__':
+    main()
+


Property changes on: Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py
___________________________________________________________________
Name: svn:executable
   + *



More information about the Checkins mailing list