#!/usr/bin/env python2.4 """opa.py: OPA Proxies APIs""" __version__ = "0.3" __license__ = "MIT License" __author__ = "Daniel Chudnov " # opa.py # A generic proxy interface for resolving identifiers to well-known # APIs for use in protocols like unAPI. To get started, copy # config.ini.default to config.ini and try the following. # # Examples: # # opa.py info:pmid/12341234 pubmed # - returns Pubmed record 12341234 # # opa.py urn:isbn:0521427061 dc # - returns Dublin Core record for ISBN 0521427061 # # Uses PyAmazon: # http://www.josephson.org/projects/pyamazon/ # # opa.py http://flickr.com/photos/someuser/12345678 # - returns a list of all the formats available for photo 12345678, # including dc metadata, html link, and all photo shapes and sizes # # opa.py http://flickr.com/photos/someuser/12345678 dc # - returns Dublin Core record for flickr photo 12345678 # # opa.py http://flickr.com/photos/someuser/12345678 wrap # - returns a json structure containing all the available files in all # available formats for this URI; non-text formats are base64-encoded # and indicated with res['base64'] = True; see # http://onebiglibrary.net/node/34 for an explanation of the structure # # Uses flickr.py: # http://jamesclarke.info/projects/flickr/ # # Also requires your own flickr and amazon API keys. :) # # You can specify arbitrary OAI-PMH repository base_urls and regexes # for their respective identifiers in config.ini and it will do the same # above kinds of things for those kinds of identifiers in each specified # repository. # # Run it by itself: # # ./opa.py # # ...and it will start a web server on 127.0.0.1:8080, serving unAPI # requests compliant with unAPI revision 1 on all configured identifer # patterns. 
# # Uses web.py: # http://webpy.org/ # # Requires: # - ElementTree, http://effbot.org/zone/element-index.htm # - SimpleJSON, http://cheeseshop.python.org/pypi/simplejson # # # CHANGES: # - Got web.py serving unAPI calls # - Set up svn # - Set up configfile for implementers to specify their API keys easily # - Made the identifiers regex-driven # - Added images sizes in the flickr proxy # - Added generic OAI-PMH proxy configurable in the configfile # - Added MODS output to the Amazon proxy # - Assigned copyright and license # - Updated for unAPI revision 1 # - Added implicit json didl-ish "wrap" format, auto-wraps all available formats # in a json wrapper a la description at http://onebiglibrary.net/node/34 # - Updated response format for # - added LAF Provider # - added ottobib, xisbn support for ISBNs and renamed Amazon to Isbn # - Refactored EXTRA_FORMATS out from flickr provider # # TODO: # - Update HTTP status codes to match unAPI recommendations # - Fetch DOI data # - Add MODS output to the Pubmed proxy # - Add a Citeseer provider that's better than the OAI proxy using its soap api: # http://labseer.ist.psu.edu/api/soap_index.asp # - Add a generic OpenSearch (RSS/Atom results) proxy configurable in the configfile # - Add a generic Atom Publishing Protocol proxy configurable in the configfile # - Add caching in case anybody takes this demo app seriously. :) # # opa.py by Daniel Chudnov # Copyright (c) 2005-2007 Yale University School of Medicine. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights to # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies # of the Software, and to permit persons to whom the Software is furnished to do # so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#
# $Id: opa.py 20 2007-02-18 22:12:08Z dlc33 $

import base64
from cStringIO import StringIO
import ConfigParser
from optparse import OptionParser
import re
import sys
import time
import traceback
import urllib
from xml.dom import pulldom

import elementtree.ElementTree as etree
from elementtree.ElementTree import Element, SubElement
import simplejson
import web

# Parsed configuration, keyed by lower-cased section name; filled in by main().
config = {}

# Mime types whose payload wrap() embeds verbatim instead of base64-encoding.
KNOWN_TEXT_TYPES = ['text/plain', 'text/html', 'text/xml', 'application/xml']

# The implicit "wrap" format every handler can serve: (name, mime type).
WRAP_FORMAT = ('wrap', 'application/x-javascript')

# XML namespace of OAI-PMH 2.0 response documents.
OAI_NS = 'http://www.openarchives.org/OAI/2.0/'

MODS_SCHEMA_LOCATION = 'http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-0.xsd'

# Maps a handler name to the regex recognising its URIs.  Plain names are
# classes defined in this module; main() adds one 'oai.*' entry per
# configured OAI-PMH repository.
uri_namespaces = {
    'Isbn': r'urn:isbn:([0-9X-]{9,17})',
    'Flickr': r'http://flickr.com/photos/\w+/(\d+)',
    'LAF' : r'(n[0-9]+-[0-9]+)',
    'Pubmed': r'info:pmid/(\d{7,10})',
    }


class OpaError (Exception):
    """Raised when a URI or format cannot be resolved or rendered."""
    pass


def _read_url (url):
    """Fetch url and return the response body, always closing the handle."""
    handle = urllib.urlopen(url)
    try:
        return handle.read()
    finally:
        handle.close()


def _bad_result (message):
    """
    Build the fetch() result for an unsupported format.

    The type is exactly 'bad' so that callers comparing against that
    literal (see unapi.GET) actually match; the human-readable detail
    travels separately under 'message'.
    """
    return {'type': 'bad', 'message': message}


def generate_mods ():
    """Generate the skeleton of a MODS record and return its root Element."""
    root = Element('mods')
    root.set('xmlns', 'http://www.loc.gov/mods/v3')
    root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink')
    root.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    root.set('xsi:schemaLocation', MODS_SCHEMA_LOCATION)
    root.set('version', '3.0')
    return root


def generate_formats_doc (formats=None, id=''):
    """
    Generate the formats response for a list of formats.

    formats is a sequence of tuples whose first two items are the format
    name and its mime type (longer tuples are accepted; the rest is
    ignored).  If id is given it is set as the 'id' attribute of the root
    element.  Returns the serialized XML document.
    """
    root = Element('formats')
    if id:
        root.set('id', id)
    for entry in formats or ():
        node = SubElement(root, 'format')
        node.set('name', entry[0])
        node.set('type', entry[1])
    return etree.tostring(root)


class Provider:
    """
    A Provider is a proxy for a remote service API.  Provider.FORMATS
    is the list of the supported fetch() formats for URIs available
    through this service.  The first listed format is the "default".
    Providers whose objects might have variable formats available should
    override formats() to insert/alter additional formats.

    FORMATS entries are (name, mime_type) and are rendered locally by a
    method named render_<name>.  EXTRA_FORMATS entries are
    (name, mime_type, link, label) and are served as links.
    """
    FORMATS = [
        ('dc', 'text/plain'),
        ]
    EXTRA_FORMATS = []
    REGEX = ''

    def __init__ (self, uri=''):
        """
        Initialize with just the URI.
        Parse the URI for required discrete values as needed.
        """
        self.uri = uri
        self.parse_uri()

    def formats (self):
        """Return the list of supported formats for this Provider."""
        return self.FORMATS + self.EXTRA_FORMATS

    def fetch (self, format=FORMATS[0][0]):
        """
        Return the specified URI in the specified format.

        The result is a dict with a 'type' of:
          'string' - 'value' holds the rendered document
          'link'   - 'value' holds a URL to redirect to / download from
          'bad'    - the format is not supported; 'message' says which
        """
        if format in [name for name, mime in self.FORMATS]:
            render = getattr(self, 'render_%s' % format)
            return {'type': 'string', 'value': render()}
        if format == WRAP_FORMAT[0]:
            return {'type': 'string', 'value': self.wrap()}
        for name, mime, link, label in self.EXTRA_FORMATS:
            if name != format:
                continue
            # Link might or might not have formatting: a template without a
            # conversion raises TypeError, a stray '%' raises ValueError.
            try:
                return {'type': 'link', 'value': link % self.id}
            except (TypeError, ValueError):
                return {'type': 'link', 'value': link}
        return _bad_result('unsupported format: %s' % format)

    def parse_uri (self):
        """
        If specific discrete components of a URI are required as
        discrete values to pass to an API, assign attributes to self for
        use in other functions.  If not, do nothing.

        Sets self.id to the first group captured by REGEX (or the whole
        match if REGEX has no groups).  self.id is None when there is no
        REGEX or the URI does not match it.
        """
        self.id = None
        if not self.REGEX:
            return
        # re.compile accepts both pattern strings and compiled patterns.
        match = re.compile(self.REGEX).match(self.uri)
        if match is None:
            return
        if match.groups():
            self.id = match.group(1)
        else:
            self.id = match.group(0)

    def mime_type (self, format=''):
        """
        For a supported format, return its mime type.

        Knows the locally rendered FORMATS and the implicit 'wrap' format;
        returns None for anything else.
        """
        if format == WRAP_FORMAT[0]:
            return WRAP_FORMAT[1]
        for name, mime in self.FORMATS:
            if name == format:
                return mime
        return None

    def wrap (self):
        """
        Return a JSON document bundling every available format of this URI.

        The document has one item whose 'res' list holds one entry per
        format with its 'mimetype', 'format' and, when a unAPI base URL is
        configured, 'unapi_source'.  Text formats are embedded as-is with
        base64 False; other formats served as links are downloaded and
        embedded base64-encoded together with their 'url'.
        """
        resources = []
        for entry in self.formats():
            format_name = entry[0]
            format_mime_type = entry[1]
            if format_name == WRAP_FORMAT[0]:
                continue
            data = self.fetch(format=format_name)
            res = {
                'mimetype': format_mime_type,
                'format': format_name,
                }
            try:
                # Insert this as a trail back for downstream apps.  The
                # parameter is 'id' because that is what unapi.GET reads.
                res['unapi_source'] = '%s?id=%s&format=%s' % (
                    config['opa']['unapi_base_url'],
                    urllib.quote(self.uri, safe=''),
                    urllib.quote(format_name, safe=''))
            except (KeyError, TypeError):
                # Assume command-line operation or misconfiguration
                pass
            # FIXME: mixing meanings; KNOWN_TEXT_TYPES with dynamic handling
            # per-handler fetch()d result['type'] value settings.
            if format_mime_type in KNOWN_TEXT_TYPES:
                res['base64'] = False
                res['data'] = data['value']
            elif data['type'] == 'link':
                res['data'] = base64.b64encode(_read_url(data['value']))
                res['base64'] = True
                res['url'] = data['value']
            resources.append(res)
        document = {
            'items': [{
                'uri': self.uri,
                'id': self.uri,
                'res': resources,
                }],
            }
        return simplejson.dumps(document)


class Isbn (Provider):
    """Proxy for book records looked up by ISBN through Amazon."""
    FORMATS = [
        ('dc', 'text/plain'),
        ('mods', 'application/xml'),
        ('amazon', 'application/xml'),
        ]
    EXTRA_FORMATS = [
        ('xisbn', 'application/xml', 'http://old-xisbn.oclc.org/webservices/xisbn/%s', 'xisbn'),
        ('ottobib', 'text/html', 'http://ottobib.com/isbn/%s/', 'ottobib'),
        ]
    REGEX = re.compile(uri_namespaces['Isbn'])

    def _lookup (self):
        """Return the first item pyamazon finds for this ISBN."""
        import amazon
        amazon.setLicense(config['api_keys']['amazon'])
        return amazon.searchByASIN(self.id)[0]

    def _authors (self, item):
        """Return the item's authors as a list, empty if it has none."""
        bag = getattr(item, 'Authors', None)
        if not bag:
            return []
        authors = getattr(bag, 'Author', None)
        if not authors:
            return []
        # A single author arrives as a bare value, several as a list.
        if isinstance(authors, list):
            return list(authors)
        return [authors]

    def render_dc (self):
        """Return tab-separated Dublin Core name/value lines for the book."""
        item = self._lookup()
        out = [('title', item.ProductName)]
        out.extend([('creator', author) for author in self._authors(item)])
        out.append(('identifier', self.uri))
        out.append(('date', item.ReleaseDate))
        out.append(('format', '%s - %s' % (item.Catalog, item.Media)))
        return '\n'.join('%s\t%s' % (k, v) for k, v in out)

    def render_mods (self):
        """Return a serialized MODS record for the book."""
        item = self._lookup()
        root = generate_mods()

        titleInfo = SubElement(root, 'titleInfo')
        title = SubElement(titleInfo, 'title')
        title.text = item.ProductName

        for author in self._authors(item):
            name = SubElement(root, 'name')
            namePart = SubElement(name, 'namePart')
            namePart.text = author
            displayForm = SubElement(name, 'displayForm')
            displayForm.text = author

        typeOfResource = SubElement(root, 'typeOfResource')
        typeOfResource.text = 'text'

        physicalDescription = SubElement(root, 'physicalDescription')
        form = SubElement(physicalDescription, 'form')
        form.set('authority', 'marcform')
        form.text = 'print'

        location = SubElement(root, 'location')
        url = SubElement(location, 'url')
        url.text = item.URL

        recordInfo = SubElement(root, 'recordInfo')
        recordContentSource = SubElement(recordInfo, 'recordContentSource')
        recordContentSource.text = "http://amazon.com/"
        recordIdentifier = SubElement(recordInfo, 'recordIdentifier')
        recordIdentifier.text = self.uri

        if getattr(item, 'ProductDescription', ''):
            abstract = SubElement(root, 'abstract')
            abstract.text = item.ProductDescription

        return etree.tostring(root)

    def render_amazon (self):
        """
        Return data extracted from Amazon's ItemLookup response.

        Raises OpaError if the pattern does not match the response.
        """
        # NOTE(review): the access key is hard-coded here rather than read
        # from config['api_keys'] like the other Amazon calls - confirm
        # whether the configured key should be used instead.
        url = 'http://webservices.amazon.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=1Y8MRP2ZG59FVJ4E8CR2&Operation=ItemLookup&ItemId=%s' % \
            self.id
        data = _read_url(url)
        # NOTE(review): as written the greedy leading '.*' leaves the capture
        # group empty; the pattern presumably lost the element names it was
        # meant to anchor on - TODO confirm against the upstream source.
        re_item = re.compile(r'.*(.*).*')
        match = re_item.search(data)
        if match:
            return match.groups()[0]
        raise OpaError('Not Found')


class Flickr (Provider):
    """Proxy for flickr photos; image sizes are discovered per photo."""
    FORMATS = [
        ('dc', 'text/plain'),
        ('mods', 'application/xml'),
        ]
    REGEX = re.compile(uri_namespaces['Flickr'])

    def _load_photo (self, pause):
        """
        Return the flickr.Photo for self.id with its properties loaded.

        Sleeps for pause seconds after loading; presumably this throttles
        calls to the flickr API - TODO confirm.
        """
        import flickr
        flickr.API_KEY = config['api_keys']['flickr']
        photo = flickr.Photo(self.id)
        photo._load_properties()
        time.sleep(pause)
        return photo

    def update_formats (self):
        """A photo at flickr might have a variety of available sizes."""
        if not self.id:
            return
        import flickr
        self._load_photo(2)
        data = flickr._doget('flickr.photos.getSizes', photo_id=self.id)
        # Assign on the instance so the class-level list is never mutated.
        extra = [('html', 'text/html', self.uri, 'html')]
        for size in data.rsp.sizes.size:
            extra.append(('jpeg_%s' % size.label, 'image/jpeg', size.source, size.label))
        self.EXTRA_FORMATS = extra

    def formats (self):
        """Return all available formats, first updating dynamically if possible."""
        self.update_formats()
        return self.FORMATS + self.EXTRA_FORMATS

    def fetch (self, format=FORMATS[0][0]):
        """Return the specified URI in the specified format"""
        # The per-photo sizes must be known before a link format can resolve.
        if not getattr(self, 'EXTRA_FORMATS', None):
            self.update_formats()
        return Provider.fetch(self, format)

    def render_dc (self):
        """Return Dublin Core name/value lines for the photo ('' without an id)."""
        if not self.id:
            return ''
        photo = self._load_photo(3)
        out = [
            ('identifier', self.uri),
            ('date', photo.datetaken[:10]),
            ('title', photo.title),
            ('creator', photo.owner.username),
            ('identifier', photo.getURL()),
            ]
        if photo.description:
            out.append(('description', photo.description))
        if photo.tags:
            out.extend([('subject', tag.raw) for tag in photo.tags])
        return '\n'.join('%s:\t%s' % (k, v) for k, v in out)

    def render_mods (self):
        """Return a serialized MODS record for the photo ('' without an id)."""
        if not self.id:
            return ''
        photo = self._load_photo(3)
        root = generate_mods()

        titleInfo = SubElement(root, 'titleInfo')
        title = SubElement(titleInfo, 'title')
        title.text = photo.title

        name = SubElement(root, 'name')
        namePart = SubElement(name, 'namePart')
        namePart.text = photo.owner.username
        displayForm = SubElement(name, 'displayForm')
        displayForm.text = photo.owner.username

        typeOfResource = SubElement(root, 'typeOfResource')
        typeOfResource.text = 'still image'

        physicalDescription = SubElement(root, 'physicalDescription')
        form = SubElement(physicalDescription, 'form')
        form.set('authority', 'marcform')
        form.text = 'electronic'
        internetMediaType = SubElement(physicalDescription, 'internetMediaType')
        internetMediaType.text = 'image/jpeg'

        location = SubElement(root, 'location')
        url = SubElement(location, 'url')
        url.text = photo.getURL()

        recordInfo = SubElement(root, 'recordInfo')
        recordContentSource = SubElement(recordInfo, 'recordContentSource')
        recordContentSource.text = "http://flickr.com"
        recordIdentifier = SubElement(recordInfo, 'recordIdentifier')
        recordIdentifier.text = self.uri

        if photo.tags:
            subject = SubElement(root, 'subject')
            for tag in photo.tags:
                topic = SubElement(subject, 'topic')
                topic.text = tag.raw

        return etree.tostring(root)


class LAF (Provider):
    """Link-only proxy: every format redirects to errol.oclc.org/laf."""
    FORMATS = []
    EXTRA_FORMATS = [
        ('html', 'text/html', 'http://errol.oclc.org/laf/%s.html', 'html'),
        ('marcxml', 'application/xml', 'http://errol.oclc.org/laf/%s.xml', 'marcxml'),
        ]
    REGEX = re.compile(uri_namespaces['LAF'])


class Pubmed (Provider):
    """Proxy for Pubmed records fetched through NCBI eutils efetch."""
    FORMATS = [
        ('pubmed', 'application/xml'),
        ('text', 'text/plain'),
        ('asn1', 'text/plain'),
        ]
    REGEX = re.compile(uri_namespaces['Pubmed'])

    def render_pubmed (self):
        """Return the first PubmedArticle element, or the raw response if none."""
        url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % self.id
        data = _read_url(url)
        tree = etree.parse(StringIO(data))
        articles = tree.findall('.//PubmedArticle')
        if articles:
            return etree.tostring(articles[0])
        return data

    def render_text (self):
        """Return efetch's retmode=text, rettype=citation response."""
        url = \
            'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=text&rettype=citation' % self.id
        return _read_url(url)

    def render_asn1 (self):
        """Return efetch's retmode=text response (no rettype given)."""
        url = \
            'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=text' % self.id
        return _read_url(url)


class OAI (Provider):
    """
    Generic proxy for an OAI-PMH repository.

    base_url is concatenated directly with 'verb=...', so it must already
    end with the query separator.  id_regex recognises the repository's
    identifiers.
    """
    FORMATS = [
        ('oai_dc', 'application/xml'),
        ]
    REGEX = ''
    BASE_URL = ''

    def __init__ (self, uri='', base_url='', id_regex=''):
        self.BASE_URL = base_url
        self.REGEX = id_regex
        Provider.__init__(self, uri=uri)

    def _request (self, query):
        """Send query (already URL-encoded) to the repository; return the body."""
        return _read_url(self.BASE_URL + query)

    def formats (self):
        """
        Return the metadata formats the repository lists for this URI.

        Reads the metadataPrefix elements of a ListMetadataFormats
        response; falls back to FORMATS when the response cannot be parsed
        or lists none.
        """
        data = self._request('verb=ListMetadataFormats&identifier=%s'
                             % urllib.quote(self.uri, safe=''))
        try:
            tree = etree.parse(StringIO(data))
        except Exception:
            # Not well-formed XML; behave as if nothing was advertised.
            return self.FORMATS
        prefixes = []
        for node in tree.findall('.//{%s}metadataPrefix' % OAI_NS):
            if node.text and node.text.strip():
                prefixes.append(node.text.strip())
        if prefixes:
            return [(prefix, 'application/xml') for prefix in prefixes]
        return self.FORMATS

    def mime_type (self, format=''):
        """Return the mime type: formats() lists every record format as XML."""
        if format == WRAP_FORMAT[0]:
            return Provider.mime_type(self, format)
        return 'application/xml'

    def fetch (self, format=FORMATS[0][0]):
        """
        Return the specified URI in the specified format.

        Raises OpaError when the repository's GetRecord response contains
        no metadata payload.
        """
        # formats() is a network round trip, so ask once and reuse the answer.
        available = [str(name) for name, mime in self.formats()]
        if str(format) in available:
            data = self._request('verb=GetRecord&identifier=%s&metadataPrefix=%s'
                                 % (urllib.quote(self.uri, safe=''),
                                    urllib.quote(str(format), safe='')))
            tree = etree.parse(StringIO(data))
            md = tree.findall('.//{%s}metadata' % OAI_NS)
            if md and len(md[0]) > 0:
                # The record proper is the first child of <metadata>.
                return {'type': 'string', 'value': etree.tostring(md[0][0])}
            raise OpaError('Could not find metadata')
        if format == WRAP_FORMAT[0]:
            return {'type': 'string', 'value': self.wrap()}
        return _bad_result('(%s not in formats:%s)' % (format, available))


# example info uri: tag:amazon.com,2006:asin:1234567890
# returns klass=Amazon
def find_handler (uri=''):
    """
    Return a Provider instance for the first namespace whose regex matches
    uri, or None if no namespace matches.

    'oai.*' namespaces are served by OAI, configured from the config
    section of the same name; any other namespace names a class in this
    module.
    """
    for klass, regex in uri_namespaces.items():
        if not re.compile(regex).match(uri):
            continue
        if klass.startswith('oai.'):
            section = config[klass]
            return OAI(uri=uri, base_url=section['base_url'],
                       id_regex=section['id_regex'])
        return getattr(sys.modules[__name__], klass)(uri)
    return None


# Serve unAPI responses via web
class unapi:
    """web.py handler answering unAPI requests."""

    def GET (self, uri):
        """
        Answer a unAPI request described by the 'id' and 'format' params.

        Without an id, lists the universally supported formats; with only
        an id, lists that object's formats (300); with both, outputs or
        redirects to the object in that format, or answers 415 when the
        format is unsupported.  Any failure results in a 400.
        """
        data = web.input(format='')
        uri = data.get('id', '')
        format = data.get('format', '')
        try:
            # Note: this is where we should check for invalid params combinations
            # and raise an error to return 400 Bad Request if necessary instead of
            # just 300 default below
            if not uri:
                web.header('Content-Type', 'application/xml')
                web.output(generate_formats_doc(formats=[WRAP_FORMAT]))
                return

            handler = find_handler(uri)
            if not handler:
                raise OpaError('Unable to find handler for uri: %s' % uri)

            if not format:
                web.context.status = '300 Multiple Choices'
                web.header('Content-Type', 'application/xml')
                web.output(generate_formats_doc(formats=handler.formats(), id=uri))
                return

            result = handler.fetch(format)
            if not result:
                return
            if result['type'] == 'string':
                mime_type = handler.mime_type(format)
                if mime_type:
                    web.header('Content-Type', mime_type)
                web.output(result['value'])
            elif result['type'] == 'link':
                web.found(result['value'])
            elif result['type'] == 'bad':
                web.context.status = '415 Unsupported Media Type'
                web.header('Content-Type', 'text/html')
                web.output('Unsupported Media Type')
        except Exception:
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed.
            traceback.print_exc()
            web.badrequest()


# start a web.py server to serve unAPI responses
def server (config=None):
    """
    Run the web.py server, routing every path to the unapi handler.

    config is accepted for the caller's convenience but is not used.
    """
    try:
        web.internalerror = web.debugerror
        urls = (
            '(.*)', 'unapi',
            )
        #web.runsimple(unapi, port=options.get('port', 8000))
        web.run(urls)#, options.get('port', 8000))
    except Exception:
        traceback.print_exc()


def load_config (file, config=None):
    """
    Read the ini file and return its options merged over config.

    The result maps each lower-cased section name to a dict of its
    options.  config itself and the section dicts it holds are left
    untouched; file values win over values already in config.  Doubled
    backslashes in values are collapsed to single ones.
    """
    merged = {}
    for name, section in (config or {}).items():
        # Copy one level down so the caller's section dicts are not mutated.
        if isinstance(section, dict):
            merged[name] = section.copy()
        else:
            merged[name] = section
    cp = ConfigParser.ConfigParser()
    cp.read(file)
    for sec in cp.sections():
        name = sec.lower()
        section = merged.get(name)
        if not isinstance(section, dict):
            section = {}
            merged[name] = section
        for opt in cp.options(sec):
            section[opt] = cp.get(sec, opt).strip().replace('\\\\', '\\')
    return merged


def main ():
    """
    Command-line entry point.

    With no arguments, starts the server.  With an identifier, prints the
    formats available for it; with an identifier and a format, prints the
    object in that format (or the link to it).  Returns 1 when the lookup
    fails, None otherwise.
    """
    global config
    parser = OptionParser()
    parser.add_option('-c', '--config', dest='config',
                      default='config.ini',
                      help='path to configuration file')
    # args keeps the program name at index 0 because sys.argv is passed whole.
    options, args = parser.parse_args(sys.argv)
    config = load_config(options.config)

    if not len(args) >= 1:
        print('Usage: opa.py identifier [format] (fetch object)')
        print('or: opa.py (runs server)')
        return

    # Check for OAI services to proxy
    for k, v in config.items():
        if k.startswith('oai.'):
            uri_namespaces[k] = v['id_regex']

    if len(args) == 1:
        # No params, start the server
        server(options)
        return

    uri = args[1]
    try:
        handler = find_handler(uri)
        if handler is None:
            raise OpaError('Unable to find handler for uri: %s' % uri)
        if len(args) == 3:
            result = handler.fetch(format=args[2])
            if result['type'] in ('string', 'link'):
                print(result['value'])
            else:
                raise OpaError('Bad URI or format: %s'
                               % result.get('message', result['type']))
        else:
            print(handler.formats())
    except Exception:
        traceback.print_exc()
        return 1


if __name__ == '__main__':
    sys.exit(main())