[Checkins] SVN: Sandbox/luciano/kirbi/src/kirbi/fetch/ continue
work on amazon.com data fetch
Luciano Ramalho
luciano at ramalho.org
Mon Jul 9 22:29:03 EDT 2007
Log message for revision 77669:
continue work on amazon.com data fetch
Changed:
U Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py
A Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml
A Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml
A Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt
A Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml
A Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py
-=-
Modified: Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py 2007-07-09 20:32:11 UTC (rev 77668)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/amazon_fetch.py 2007-07-10 02:29:00 UTC (rev 77669)
@@ -4,7 +4,18 @@
from urllib import quote
from lxml import etree
from StringIO import StringIO
+from time import sleep
+"""
+NOTE: 0333647289 is a valid ISBN which generates a AWS.InvalidParameterValue
+ from Amazon.com with message: "0333647289 is not a valid value for ItemId"
+ The book is Virtual History: Alternatives and Counterfactuals
+ by Niall Ferguson (Editor)
+ Amazon.com does not have it but Amazon.co.uk does and
+ Google query "isbn 0333647289" also found it here:
+ http://www.alibris.com/search/search.cfm?qwork=7055972
+"""
+
class AmazonECS(object):
xml_namespace = """http://webservices.amazon.com/AWSECommerceService/2005-10-05"""
@@ -24,7 +35,9 @@
query.append('%s=%s' % (key,quote(val)))
return self.base_url + '?' + '&'.join(query)
- def fetchTree(self, url):
+ def getFile(self, url):
+ # Amazon.com ECS agreement imposes a limit of one request per second
+ sleep(1)
resp, content = self.httpcli.request(url, 'GET')
self.tree = etree.parse(StringIO(content))
return resp, content
@@ -35,32 +48,18 @@
parts = path.split('/')
return ns+('/'+ns).join(parts)
- def itemLookup(self,itemId):
- params = {'Operation':'ItemLookup', 'ItemId':itemId}
+ def itemLookup(self,itemId,response='ItemAttributes'):
+ params = { 'Operation':'ItemLookup',
+ 'ItemId':itemId,
+ 'ResponseGroup':response
+ }
url = self.buildURL(**params)
- return self.fetchTree(url)
+ return self.getFile(url)[1]
def findAll(self,path):
- pass
+ pass
-def fetch(asin):
- params['asin'] = asin
- params['op'] = 'ItemLookup'
- print asin
- resp, content = h.request(URL % params, 'GET')
- tree = etree.parse(StringIO(content))
- # the tree root is the toplevel html element
- items = tree.findall(qPath('Items/Item/ItemAttributes',NS))
- for item in items:
- print item.find(qPath('Title',NS)).text
- for author in item.findall(qPath('Author',NS)):
- print 'author: ', author.text
- for creator in item.findall(qPath('Creator',NS)):
- print 'creator: ', creator.text
-
-
-
if __name__=='__main__':
from amazon_config import ACCESS_KEY_ID, ASSOCIATE_TAG
@@ -69,7 +68,8 @@
gof = '0201633612'
awpr = '0977616630'
oss = '1565925823'
- print ecs.itemLookup(alice)
+ dup = '0141000511'
+ print ecs.itemLookup(oss)
Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-alice.xml 2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ItemLookupResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05">
+ <OperationRequest>
+ <HTTPHeaders>
+ <Header Name="UserAgent" Value="Python-httplib2/$Rev: 235 $"></Header>
+ </HTTPHeaders>
+ <RequestId>1MTBTZB8ENGD2XG0F35E</RequestId>
+ <Arguments>
+ <Argument Name="AssociateTag" Value="circulante-20"></Argument>
+ <Argument Name="ItemId" Value="0393048470"></Argument>
+ <Argument Name="Service" Value="AWSECommerceService"></Argument>
+ <Argument Name="ResponseGroup" Value="ItemAttributes"></Argument>
+ <Argument Name="Operation" Value="ItemLookup"></Argument>
+ <Argument Name="AWSAccessKeyId" Value="13W2MMDG65QJJK9GG402"></Argument>
+ </Arguments>
+ <RequestProcessingTime>0.0305209159851074</RequestProcessingTime>
+ </OperationRequest>
+ <Items>
+ <Request>
+ <IsValid>True</IsValid>
+ <ItemLookupRequest>
+ <ItemId>0393048470</ItemId>
+ <ResponseGroup>ItemAttributes</ResponseGroup>
+ </ItemLookupRequest>
+ </Request>
+ <Item>
+ <ASIN>0393048470</ASIN>
+ <DetailPageURL>http://www.amazon.com/gp/redirect.html%3FASIN=0393048470%26tag=circulante-20%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0393048470%253FSubscriptionId=13W2MMDG65QJJK9GG402</DetailPageURL>
+ <ItemAttributes>
+ <Author>Lewis Carroll</Author>
+ <Binding>Hardcover</Binding>
+ <Brand>W.W. Norton &amp; Company</Brand>
+ <Creator Role="Editor">Martin Gardner</Creator>
+ <Creator Role="Illustrator">John Tenniel</Creator>
+ <DeweyDecimalNumber>823.8</DeweyDecimalNumber>
+ <EAN>9780393048476</EAN>
+ <Edition>Upd Sub</Edition>
+ <ISBN>0393048470</ISBN>
+ <Label>W. W. Norton &amp; Company</Label>
+ <ListPrice>
+ <Amount>2995</Amount>
+ <CurrencyCode>USD</CurrencyCode>
+ <FormattedPrice>$29.95</FormattedPrice>
+ </ListPrice>
+ <Manufacturer>W. W. Norton &amp; Company</Manufacturer>
+ <NumberOfItems>1</NumberOfItems>
+ <NumberOfPages>312</NumberOfPages>
+ <PackageDimensions>
+ <Height Units="hundredths-inches">112</Height>
+ <Length Units="hundredths-inches">1023</Length>
+ <Weight Units="hundredths-pounds">246</Weight>
+ <Width Units="hundredths-inches">875</Width>
+ </PackageDimensions>
+ <ProductGroup>Book</ProductGroup>
+ <PublicationDate>1999-11</PublicationDate>
+ <Publisher>W. W. Norton &amp; Company</Publisher>
+ <Studio>W. W. Norton &amp; Company</Studio>
+ <Title>The Annotated Alice: The Definitive Edition</Title>
+ </ItemAttributes>
+ </Item>
+ </Items>
+</ItemLookupResponse>
Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-dup-author.xml 2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ItemLookupResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05">
+<OperationRequest>
+<HTTPHeaders>
+<Header Name="UserAgent" Value="Python-httplib2/$Rev: 235 $">
+</Header>
+</HTTPHeaders>
+<RequestId>078382479W70DGS4GWCS</RequestId>
+<Arguments>
+<Argument Name="AssociateTag" Value="circulante-20">
+</Argument>
+<Argument Name="ItemId" Value="0141000511">
+</Argument>
+<Argument Name="Service" Value="AWSECommerceService">
+</Argument>
+<Argument Name="ResponseGroup" Value="ItemAttributes">
+</Argument>
+<Argument Name="Operation" Value="ItemLookup">
+</Argument>
+<Argument Name="AWSAccessKeyId" Value="13W2MMDG65QJJK9GG402">
+</Argument>
+</Arguments>
+<RequestProcessingTime>0.0172529220581055</RequestProcessingTime>
+</OperationRequest>
+<Items>
+<Request>
+<IsValid>True</IsValid>
+<ItemLookupRequest>
+<ItemId>0141000511</ItemId>
+<ResponseGroup>ItemAttributes</ResponseGroup>
+</ItemLookupRequest>
+</Request>
+<Item>
+<ASIN>0141000511</ASIN>
+<DetailPageURL>http://www.amazon.com/gp/redirect.html%3FASIN=0141000511%26tag=circulante-20%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0141000511%253FSubscriptionId=13W2MMDG65QJJK9GG402</DetailPageURL>
+<ItemAttributes>
+<Author>Steven Levy</Author>
+<Author>Steven Levy</Author>
+<Binding>Paperback</Binding>
+<Brand>Penguin Non-Classics</Brand>
+<DeweyDecimalNumber>005</DeweyDecimalNumber>
+<EAN>9780141000510</EAN>
+<Edition>Updated</Edition>
+<ISBN>0141000511</ISBN>
+<Label>Penguin (Non-Classics)</Label>
+<ListPrice>
+<Amount>1600</Amount>
+<CurrencyCode>USD</CurrencyCode>
+<FormattedPrice>$16.00</FormattedPrice>
+</ListPrice>
+<Manufacturer>Penguin (Non-Classics)</Manufacturer>
+<NumberOfItems>1</NumberOfItems>
+<NumberOfPages>464</NumberOfPages>
+<PackageDimensions>
+<Height Units="hundredths-inches">106</Height>
+<Length Units="hundredths-inches">788</Length>
+<Weight Units="hundredths-pounds">85</Weight>
+<Width Units="hundredths-inches">528</Width>
+</PackageDimensions>
+<ProductGroup>Book</ProductGroup>
+<PublicationDate>2001-01</PublicationDate>
+<Publisher>Penguin (Non-Classics)</Publisher>
+<ReleaseDate>2001-01-02</ReleaseDate>
+<Studio>Penguin (Non-Classics)</Studio>
+<Title>Hackers: Heroes of the Computer Revolution</Title>
+</ItemAttributes>
+</Item>
+</Items>
+</ItemLookupResponse>
Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.txt 2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,36 @@
+Author Chris DiBona
+Author Sam Ockman
+Author Mark Stone
+Author Brian Behlendorf
+Author Scott Bradner
+Author Jim Hamerly
+Author Kirk McKusick
+Author Tim O'Reilly
+Author Tom Paquin
+Author Bruce Perens
+Author Eric Raymond
+Author Richard Stallman
+Author Michael Tiemann
+Author Linus Torvalds
+Author Paul Vixie
+Author Larry Wall
+Author Bob Young
+Binding Paperback
+Brand O'Reilly Media
+DeweyDecimalNumber 5.1068
+EAN 9781565925823
+Edition 1
+Format Illustrated
+ISBN 1565925823
+Label O'Reilly Media, Inc.
+ListPrice
+Manufacturer O'Reilly Media, Inc.
+NumberOfItems 1
+NumberOfPages 280
+PackageDimensions
+ProductGroup Book
+PublicationDate 1999-01
+Publisher O'Reilly Media, Inc.
+Studio O'Reilly Media, Inc.
+Title Open Sources: Voices from the Open Source Revolution (O'Reilly Open Source)
+UPC 636920925828
Added: Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/item-oss.xml 2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8"?><ItemLookupResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"><OperationRequest><HTTPHeaders><Header Name="UserAgent" Value="Python-httplib2/$Rev: 235 $"></Header></HTTPHeaders><RequestId>1S4RCZT6BWKTCPN573E0</RequestId><Arguments><Argument Name="AssociateTag" Value="circulante-20"></Argument><Argument Name="ItemId" Value="1565925823"></Argument><Argument Name="Service" Value="AWSECommerceService"></Argument><Argument Name="ResponseGroup" Value="ItemAttributes"></Argument><Argument Name="Operation" Value="ItemLookup"></Argument><Argument Name="AWSAccessKeyId" Value="13W2MMDG65QJJK9GG402"></Argument></Arguments><RequestProcessingTime>0.0315189361572266</RequestProcessingTime></OperationRequest><Items><Request><IsValid>True</IsValid><ItemLookupRequest><ItemId>1565925823</ItemId><ResponseGroup>ItemAttributes</ResponseGroup></ItemLookupRequest></Request><Item><ASIN>1565925823</ASIN><DetailPageURL>http://www.amazon.com/gp/redirect.html%3FASIN=1565925823%26tag=circulante-20%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/1565925823%253FSubscriptionId=13W2MMDG65QJJK9GG402</DetailPageURL><ItemAttributes><Author>Chris DiBona</Author><Author>Sam Ockman</Author><Author>Mark Stone</Author><Author>Brian Behlendorf</Author><Author>Scott Bradner</Author><Author>Jim Hamerly</Author><Author>Kirk McKusick</Author><Author>Tim O'Reilly</Author><Author>Tom Paquin</Author><Author>Bruce Perens</Author><Author>Eric Raymond</Author><Author>Richard Stallman</Author><Author>Michael Tiemann</Author><Author>Linus Torvalds</Author><Author>Paul Vixie</Author><Author>Larry Wall</Author><Author>Bob Young</Author><Binding>Paperback</Binding><Brand>O'Reilly Media</Brand><DeweyDecimalNumber>005.1068</DeweyDecimalNumber><EAN>9781565925823</EAN><Edition>1</Edition><Format>Illustrated</Format><ISBN>1565925823</ISBN><Label>O'Reilly Media, 
Inc.</Label><ListPrice><Amount>2495</Amount><CurrencyCode>USD</CurrencyCode><FormattedPrice>$24.95</FormattedPrice></ListPrice><Manufacturer>O'Reilly Media, Inc.</Manufacturer><NumberOfItems>1</NumberOfItems><NumberOfPages>280</NumberOfPages><PackageDimensions><Height Units="hundredths-inches">69</Height><Length Units="hundredths-inches">916</Length><Weight Units="hundredths-pounds">106</Weight><Width Units="hundredths-inches">704</Width></PackageDimensions><ProductGroup>Book</ProductGroup><PublicationDate>1999-01</PublicationDate><Publisher>O'Reilly Media, Inc.</Publisher><Studio>O'Reilly Media, Inc.</Studio><Title>Open Sources: Voices from the Open Source Revolution (O'Reilly Open Source)</Title><UPC>636920925828</UPC></ItemAttributes></Item></Items></ItemLookupResponse>
Added: Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py
===================================================================
--- Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py (rev 0)
+++ Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py 2007-07-10 02:29:00 UTC (rev 77669)
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+lxml_test.py
+
+Groo notes:
+fixed another Amazon corner case: sometimes the Author element is duplicated
+with the same content! For example: ISBN=0141000511
+fix to handle Amazon corner-case: sometimes they don't have an Author
+element
+
+"""
+from lxml import etree, objectify
+from StringIO import StringIO
+
+from IPython.Shell import IPShellEmbed
+ipshell = IPShellEmbed()
+# ipshell() # this call anywhere in your program will start IPython
+
+def main():
+ xml = file('item-oss.xml')
+
+ parser = etree.XMLParser(remove_blank_text=True)
+ lookup = objectify.ObjectifyElementClassLookup()
+ parser.setElementClassLookup(lookup)
+ tree = etree.parse(xml, parser)
+ #ipshell()
+ raiz = tree.getroot()
+ assert len(raiz.Items.Item) == 1
+ for attr in raiz.Items.Item.ItemAttributes.getchildren():
+ tag = attr.tag[attr.tag.find('}')+1:]
+ print '%s\t%s' % (tag, attr),
+ if tag == 'Creator':
+ print '(%s)' % attr.get('Role')
+ else:
+ print
+
+
+
+if __name__ == '__main__':
+ main()
+
Property changes on: Sandbox/luciano/kirbi/src/kirbi/fetch/lxml_test.py
___________________________________________________________________
Name: svn:executable
+ *
More information about the Checkins
mailing list