[Checkins] SVN: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/
refactoring of fetch
Luciano Ramalho
luciano at ramalho.org
Mon Aug 6 15:40:22 EDT 2007
Log message for revision 78639:
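refactoring of fetch: renamed source_amazon.py and source_amazon_config.py to amazonsource.py and amazonsource_config.py (fetch.py now imports AmazonSource directly); fetch.py computes the cover image path up front and skips the download when the file already exists; fixed the port number formatting in the startup message of tests/dummy_server.py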
Changed:
_U Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/
_U Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazon.com/
A Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource.py
A Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource_config.py
U Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/fetch.py
D Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon.py
D Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon_config.py
U Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/tests/dummy_server.py
-=-
Property changes on: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch
___________________________________________________________________
Name: svn:ignore
- *.kpf
*.pyc
amazon_config.py
+ *.kpf
*.pyc
amazon.com
Property changes on: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazon.com
___________________________________________________________________
Name: svn:ignore
+ *.xml
Copied: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource.py (from rev 78635, Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon.py)
===================================================================
--- Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource.py (rev 0)
+++ Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource.py 2007-08-06 19:40:22 UTC (rev 78639)
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+from zope.interface import implements
+from interfaces import IMetadataSource
+
+from lxml import etree
+
+from urllib import quote
+import sys
+from StringIO import StringIO
+
+from amazonsource_config import ACCESS_KEY_ID, ASSOCIATE_TAG
+
+"""
+Structure of the AmazonECS XML response:
+
+ItemLookupResponse
+ OperationRequest
+ (...)
+ Items
+ Request
+ IsValid
+ ItemLookupRequest
+ ItemId
+ ResponseGroup
+ (Errors)
+ (Error)
+ (Code)
+ (Message)
+ (Item)
+ (ItemAttributes)
+ (Author)
+ (Creator Role=...)
+
+Notes:
+- Errors element occurs when ISBN is non-existent;
+ in that case, Code contains the string "AWS.InvalidParameterValue"
+- Author element is not always present
+- Author element may be duplicated with the same content,
+ except for whitespace; for example: ISBN=0141000511
+"""
+
+FIELD_MAP = [
+ # Book schema -> Amazon ECS element path, relative to Item element
+ ('title', 'ItemAttributes/Title'),
+ ('isbn13', 'ItemAttributes/EAN'),
+ ('edition', 'ItemAttributes/Edition'),
+ ('publisher', 'ItemAttributes/Publisher'),
+ ('issued', 'ItemAttributes/PublicationDate'),
+ ('subjects', 'ItemAttributes/DeweyDecimalNumber'),
+ ('image_url', 'LargeImage/URL'),
+ ('source_url', 'DetailPageURL'),
+ ('source_item_id', 'ASIN'),
+ ]
+
+CREATOR_TAGS = ['ItemAttributes/Author', 'ItemAttributes/Creator']
+
+AMAZON_CODE_NO_MATCH = 'AWS.ECommerceService.NoExactMatches'
+
+class AmazonSource(object):
+ implements(IMetadataSource)
+
+ name = 'amazon.com'
+ max_ids_per_request = 3
+
+
+ base_url = """http://ecs.amazonaws.com/onca/xml"""
+
+ def __init__(self):
+ self.base_params = { 'Service':'AWSECommerceService',
+ 'AWSAccessKeyId':ACCESS_KEY_ID,
+ 'AssociateTag': ASSOCIATE_TAG
+ }
+ self.xml = ''
+ self.http_response = {}
+
+ def buildURL(self, **kw):
+ query = []
+ kw.update(self.base_params)
+ for key, val in kw.items():
+ query.append('%s=%s' % (key,quote(val)))
+ return self.base_url + '?' + '&'.join(query)
+
+ def buildItemLookupURL(self,itemId,response='ItemAttributes'):
+ params = { 'Operation':'ItemLookup',
+ 'ItemId':itemId,
+ 'ResponseGroup':response
+ }
+ return self.buildURL(**params)
+
+ def buildItemSearchURL(self,query,response='ItemAttributes,Images'):
+ params = { 'Operation':'ItemSearch',
+ 'SearchIndex':'Books',
+ 'Power':query,
+ 'ResponseGroup':response
+ }
+ return self.buildURL(**params)
+
+ def buildMultipleBookDetailsURL(self, isbns):
+ query = 'isbn:' + ' or '.join(isbns)
+ return self.buildItemSearchURL(query)
+
+ def nsPath(self, *paths):
+ """Prepend namespace to each part of the path."""
+ parts = []
+ for path in paths:
+ parts.extend(path.split('/'))
+ return '/'.join([self.ns+part for part in parts])
+
+ def parseMultipleBookDetails(self, xml):
+ xml = StringIO(xml)
+ tree = etree.parse(xml)
+ root = tree.getroot()
+ # get the XML namespace from the root tag
+ self.ns = root.tag.split('}')[0] + '}'
+ request = root.find(self.nsPath('Items/Request'))
+ error_code = request.findtext(self.nsPath('Errors/Error/Code'))
+ if error_code is None:
+ book_list = []
+ for item in root.findall(self.nsPath('Items/Item')):
+ book_dic = {}
+ for field, tag in FIELD_MAP:
+ elem = item.find(self.nsPath(tag))
+ if elem is not None:
+ book_dic[field] = elem.text
+ creators = []
+ for tag in CREATOR_TAGS:
+ for elem in item.findall(self.nsPath(tag)):
+ if elem is None: continue
+ role = elem.attrib.get('Role')
+ if role:
+ creator = '%s (%s)' % (elem.text, role)
+ else:
+ creator = elem.text
+ creators.append(creator)
+ if creators:
+ book_dic['creators'] = creators
+ if book_dic.get('subjects'):
+ # subjects is a Tuple field
+ book_dic['subjects'] = (book_dic['subjects'],)
+ book_dic['source'] = self.name
+
+ book_list.append(book_dic)
+ return book_list
+
+ elif error_code == AMAZON_CODE_NO_MATCH:
+ return []
+ else:
+ raise EnvironmentError, error_code
+
Copied: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource_config.py (from rev 78634, Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon_config.py)
===================================================================
--- Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource_config.py (rev 0)
+++ Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/amazonsource_config.py 2007-08-06 19:40:22 UTC (rev 78639)
@@ -0,0 +1,9 @@
+ASSOCIATE_TAG = 'circulante-20'
+ACCESS_KEY_ID = '13W2MMDG65QJJK9GG402'
+
+#UK
+#ASSOCIATE_TAG = 'circulante-21'
+#DE
+#ASSOCIATE_TAG = 'circulante0f-21'
+#FR
+#ASSOCIATE_TAG = 'circulante07-21'
Modified: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/fetch.py
===================================================================
--- Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/fetch.py 2007-08-06 19:06:58 UTC (rev 78638)
+++ Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/fetch.py 2007-08-06 19:40:22 UTC (rev 78639)
@@ -6,7 +6,7 @@
from twisted.web import xmlrpc, client
from os import path
-import source_amazon as amazon
+from amazonsource import AmazonSource
from pprint import pprint
@@ -58,9 +58,17 @@
if url:
filename = book.get('isbn13',book['source_item_id'])
filename += '.' + url.split('.')[-1]
- deferred = client.getPage(url)
- deferred.addCallback(self.downloadedImage, filename)
- deferred.addErrback(self.downloadError, url)
+ # XXX: find a proper way to calculate the static image dir
+ filepath = '../../..'
+ filepath = path.join(filepath,'src','kirbi','static',
+ 'covers','large',filename)
+ # avoid duplicate downloads
+ if not path.exists(filepath):
+ deferred = client.getPage(url)
+ deferred.addCallback(self.downloadedImage, filepath)
+ deferred.addErrback(self.downloadError, url)
+ else:
+ print 'skipping existing:', filepath
if KEEP_FILES:
filename = '_'.join(isbns)+'.xml'
@@ -69,13 +77,9 @@
out.close()
- def downloadedImage(self, bytes, filename):
- # XXX: find a proper way to calculate the static image dir
- dest = '../../..'
- dest = path.join(dest,'src','kirbi','static','covers','large'
- ,filename)
- print 'saving: ', dest
- out = file(dest, 'wb')
+ def downloadedImage(self, bytes, filepath):
+ print 'saving:', filepath
+ out = file(filepath, 'wb')
out.write(bytes)
out.close()
@@ -93,7 +97,7 @@
xmlrpc_url = 'http://localhost:8080/RPC2'
poll_method = 'dump_pending_isbns'
callback = 'add_books'
- fetcher = Fetch(xmlrpc_url, poll_method, callback, amazon.Source())
+ fetcher = Fetch(xmlrpc_url, poll_method, callback, AmazonSource())
reactor.callLater(0, fetcher.poll)
print 'reactor start'
reactor.run()
Deleted: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon.py
===================================================================
--- Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon.py 2007-08-06 19:06:58 UTC (rev 78638)
+++ Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon.py 2007-08-06 19:40:22 UTC (rev 78639)
@@ -1,155 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-from zope.interface import implements
-from interfaces import IMetadataSource
-
-from lxml import etree
-from twisted.internet import reactor
-from twisted.web import xmlrpc, client
-
-from urllib import quote
-from time import sleep
-import sys
-from StringIO import StringIO
-
-
-from source_amazon_config import ACCESS_KEY_ID, ASSOCIATE_TAG
-
-"""
-Structure of the AmazonECS XML response:
-
-ItemLookupResponse
- OperationRequest
- (...)
- Items
- Request
- IsValid
- ItemLookupRequest
- ItemId
- ResponseGroup
- (Errors)
- (Error)
- (Code)
- (Message)
- (Item)
- (ItemAttributes)
- (Author)
- (Creator Role=...)
-
-Notes:
-- Errors element occurs when ISBN is non-existent;
- in that case, Code contains the string "AWS.InvalidParameterValue"
-- Author element is not always present
-- Author element may be duplicated with the same content,
- except for whitespace; for example: ISBN=0141000511
-"""
-
-FIELD_MAP = [
- # Book schema -> Amazon ECS element path, relative to Item element
- ('title', 'ItemAttributes/Title'),
- ('isbn13', 'ItemAttributes/EAN'),
- ('edition', 'ItemAttributes/Edition'),
- ('publisher', 'ItemAttributes/Publisher'),
- ('issued', 'ItemAttributes/PublicationDate'),
- ('subjects', 'ItemAttributes/DeweyDecimalNumber'),
- ('image_url', 'LargeImage/URL'),
- ('source_url', 'DetailPageURL'),
- ('source_item_id', 'ASIN'),
- ]
-
-CREATOR_TAGS = ['ItemAttributes/Author', 'ItemAttributes/Creator']
-
-AMAZON_CODE_NO_MATCH = 'AWS.ECommerceService.NoExactMatches'
-
-class AmazonSource(object):
- implements(IMetadataSource)
-
- name = 'amazon.com'
- max_ids_per_request = 3
-
-
- base_url = """http://ecs.amazonaws.com/onca/xml"""
-
- def __init__(self):
- self.base_params = { 'Service':'AWSECommerceService',
- 'AWSAccessKeyId':ACCESS_KEY_ID,
- 'AssociateTag': ASSOCIATE_TAG
- }
- self.xml = ''
- self.http_response = {}
-
- def buildURL(self, **kw):
- query = []
- kw.update(self.base_params)
- for key, val in kw.items():
- query.append('%s=%s' % (key,quote(val)))
- return self.base_url + '?' + '&'.join(query)
-
- def buildItemLookupURL(self,itemId,response='ItemAttributes'):
- params = { 'Operation':'ItemLookup',
- 'ItemId':itemId,
- 'ResponseGroup':response
- }
- return self.buildURL(**params)
-
- def buildItemSearchURL(self,query,response='ItemAttributes,Images'):
- params = { 'Operation':'ItemSearch',
- 'SearchIndex':'Books',
- 'Power':query,
- 'ResponseGroup':response
- }
- return self.buildURL(**params)
-
- def buildMultipleBookDetailsURL(self, isbns):
- query = 'isbn:' + ' or '.join(isbns)
- return self.buildItemSearchURL(query)
-
- def nsPath(self, *paths):
- """Prepend namespace to each part of the path."""
- parts = []
- for path in paths:
- parts.extend(path.split('/'))
- return '/'.join([self.ns+part for part in parts])
-
- def parseMultipleBookDetails(self, xml):
- xml = StringIO(xml)
- tree = etree.parse(xml)
- root = tree.getroot()
- # get the XML namespace from the root tag
- self.ns = root.tag.split('}')[0] + '}'
- request = root.find(self.nsPath('Items/Request'))
- error_code = request.findtext(self.nsPath('Errors/Error/Code'))
- if error_code is None:
- book_list = []
- for item in root.findall(self.nsPath('Items/Item')):
- book_dic = {}
- for field, tag in FIELD_MAP:
- elem = item.find(self.nsPath(tag))
- if elem is not None:
- book_dic[field] = elem.text
- creators = []
- for tag in CREATOR_TAGS:
- for elem in item.findall(self.nsPath(tag)):
- if elem is None: continue
- role = elem.attrib.get('Role')
- if role:
- creator = '%s (%s)' % (elem.text, role)
- else:
- creator = elem.text
- creators.append(creator)
- if creators:
- book_dic['creators'] = creators
- if book_dic.get('subjects'):
- # subjects is a Tuple field
- book_dic['subjects'] = (book_dic['subjects'],)
- book_dic['source'] = self.name
-
- book_list.append(book_dic)
- return book_list
-
- elif error_code == AMAZON_CODE_NO_MATCH:
- return []
- else:
- raise EnvironmentError, error_code
-
Deleted: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon_config.py
===================================================================
--- Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon_config.py 2007-08-06 19:06:58 UTC (rev 78638)
+++ Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/source_amazon_config.py 2007-08-06 19:40:22 UTC (rev 78639)
@@ -1,9 +0,0 @@
-ASSOCIATE_TAG = 'circulante-20'
-ACCESS_KEY_ID = '13W2MMDG65QJJK9GG402'
-
-#UK
-#ASSOCIATE_TAG = 'circulante-21'
-#DE
-#ASSOCIATE_TAG = 'circulante0f-21'
-#FR
-#ASSOCIATE_TAG = 'circulante07-21'
Modified: Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/tests/dummy_server.py
===================================================================
--- Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/tests/dummy_server.py 2007-08-06 19:06:58 UTC (rev 78638)
+++ Sandbox/luciano/kirbi/kirbifetch/src/kirbifetch/tests/dummy_server.py 2007-08-06 19:40:22 UTC (rev 78639)
@@ -67,7 +67,7 @@
xmlrpc.register_instance(srv)
- print 'SimpleXMLRPCServer running on port %s...', PORT
+ print 'SimpleXMLRPCServer running on port %s...' % PORT
xmlrpc.serve_forever()
More information about the Checkins
mailing list