# Source code for PyOpenWorm.document

from six.moves.urllib.parse import urlparse, urlencode
from six.moves.urllib.request import Request, urlopen
from six.moves.urllib.error import HTTPError, URLError
import re
import logging
from yarom.graphObject import IdentifierMissingException
from .context import Context
from .dataObject import DataObject, DatatypeProperty, Alias

from PyOpenWorm import bibtex as BIB

logger = logging.getLogger(__name__)


class WormbaseRetrievalException(Exception):
    pass


class PubmedRetrievalException(Exception):
    pass


# A little bit about why this is a separate type from Document:
#
# This type corresponds to a document which has some statements that we care
# about. The key reason this is distinct from Document is that a document need
# not provide evidence of anything. For example, the `WormData.n4` file
# generated by insert_worm.py is a document, but it doesn't provide any
# scientific or logical justification for any of the statements made within it.
class BaseDocument(DataObject):
    def make_context_identifier(self):
        return self.make_identifier(self.identifier)

    @property
    def as_context(self):
        if self.context is not None:
            return Context.contextualize(self.context)(ident=self.make_context_identifier())
        else:
            return Context(ident=self.make_context_identifier())


class Document(BaseDocument):
    """ A representation of some document.

    Possible keys include::

        pmid, pubmed: a pubmed id or url (e.g., 24098140)
        wbid, wormbase: a wormbase id or url (e.g., WBPaper00044287)
        doi: a Digital Object id or url (e.g., s00454-010-9273-0)
        uri: a URI specific to the document, preferably usable for accessing
             the document
    """

    author = DatatypeProperty(multiple=True)
    ''' An author of the document '''

    doi = DatatypeProperty()
    ''' A Digital Object Identifier (DOI), optional '''

    uri = DatatypeProperty(multiple=True)
    ''' A non-standard URI for the document '''

    wbid = DatatypeProperty()
    ''' An ID from WormBase.org that points to a record, optional '''

    wormbaseid = Alias(wbid)
    ''' An alias to `wbid` '''

    pmid = DatatypeProperty()
    ''' A PubMed ID (PMID) that points to a paper '''

    year = DatatypeProperty()
    ''' The year (e.g., publication year) of the document '''

    date = Alias(year)
    ''' Alias to year '''

    title = DatatypeProperty()
    ''' The title of the document '''

    def __init__(self, bibtex=None, doi=None, pubmed=None, wormbase=None, **kwargs):
        """
        Parameters
        ----------
        bibtex : string
            A string containing a single BibTeX entry. Parsed during
            initialization, but not saved thereafter. optional
        doi : string
            A Digital Object Identifier (DOI). optional
        pubmed : string
            A PubMed ID (PMID) or URL that points to a paper. Ignored if
            'pmid' is provided. optional
        wormbase : string
            An ID or URL from WormBase that points to a record. Ignored if
            `wbid` or `wormbaseid` are provided. optional
        """
        super(Document, self).__init__(**kwargs)
        self.id_precedence = ('doi', 'pmid', 'wbid', 'uri')
        if bibtex is not None:
            self.update_with_bibtex(bibtex)

        if pubmed is not None and not self.pmid.has_defined_value():
            if pubmed[:4] == 'http':
                _tmp = _pubmed_uri_to_pmid(pubmed)
                if _tmp is None:
                    raise ValueError("Couldn't convert Pubmed URL to a PubMed ID")
                pmid = _tmp
            else:
                pmid = pubmed
            self.pmid.set(pmid)

        if wormbase is not None and not self.wbid.has_defined_value():
            if wormbase[:4] == 'http':
                _tmp = _wormbase_uri_to_wbid(wormbase)
                if _tmp is None:
                    raise ValueError("Couldn't convert Wormbase URL to a Wormbase ID")
                wbid = _tmp
            else:
                wbid = wormbase
            self.wbid.set(wbid)

        if doi is not None:
            if doi[:4] == 'http':
                _tmp = _doi_uri_to_doi(doi)
                if _tmp is not None:
                    doi = _tmp
            self.doi.set(doi)

    def update_with_bibtex(self, bibtex):
        bib_db = BIB.loads(bibtex)
        if len(bib_db.entries) > 1:
            raise ValueError('The given BibTex string has %d entries.'
                             ' Cannot determine which entry to use for the'
                             ' document' % len(bib_db.entries))
        BIB.update_document_with_bibtex(self, bib_db.entries[0])
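
    # Illustrative usage (a sketch, not executed here): URL forms of the
    # 'pubmed' and 'doi' keyword arguments are reduced to bare IDs during
    # initialization. The PMID and WormBase ID are the examples from the class
    # docstring; the URL forms are hypothetical, and passing other property
    # values (e.g. title) assumes DataObject accepts them as keyword arguments.
    #
    #   >>> d = Document(pubmed='http://www.ncbi.nlm.nih.gov/pubmed/24098140')
    #   >>> d = Document(wbid='WBPaper00044287', title='Some title')
    #   >>> d = Document(doi='https://doi.org/10.1007/s00454-010-9273-0')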

    def defined_augment(self):
        for x in self.id_precedence:
            if getattr(self, x).has_defined_value():
                return True
        return False

    def identifier_augment(self):
        for idKind in self.id_precedence:
            idprop = getattr(self, idKind)
            if idprop.has_defined_value():
                s = str(idKind) + ":" + idprop.defined_values[0].identifier.n3()
                return self.make_identifier(s)
        raise IdentifierMissingException(self)
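
    # Precedence, illustrated with hypothetical identifiers: when both a DOI
    # and a PMID are defined, identifier_augment() derives the identifier from
    # the DOI alone, since 'doi' comes before 'pmid' in id_precedence.
    #
    #   >>> d = Document(doi='10.1007/s00454-010-9273-0', pubmed='24098140')
    #   >>> # d's identifier is built from "doi:..."; the PMID is still stored
    #   >>> # on the object but does not contribute to identification.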

    # TODO: Provide a way to override modification of already set values.
    def update_from_wormbase(self, replace_existing=False):
        """ Queries wormbase for additional data to fill in the Document.

        If replace_existing is set to `True`, then existing values will be
        cleared.
        """
        # XXX: wormbase's REST API is pretty sparse in terms of data provided.
        # Would be better off using AQL or the perl interface

        # _Very_ few of these have these fields filled in
        wbid = self.wbid.defined_values
        if len(wbid) == 1:
            wbid = wbid[0].identifier.toPython()

            # get the author
            try:
                root = self.conf.get('wormbase_api_root_url', 'http://rest.wormbase.org')
                url = root + '/rest/widget/paper/' + str(wbid) + '/overview?content-type=application%2Fjson'
                j = _json_request(url)
                if 'fields' in j:
                    f = j['fields']
                    if 'authors' in f:
                        dat = f['authors']['data']
                        if dat is not None:
                            if replace_existing and self.author.has_defined_value():
                                self.author.clear()
                            for x in dat:
                                self.author.set(x['label'])

                    for fname in ('pmid', 'year', 'title', 'doi'):
                        if fname in f and f[fname]['data'] is not None:
                            attr = getattr(self, fname)
                            if replace_existing and attr.has_defined_value():
                                attr.clear()
                            attr.set(f[fname]['data'])
            except Exception:
                logger.warning("Couldn't retrieve Wormbase data", exc_info=True)
        elif len(wbid) == 0:
            raise WormbaseRetrievalException("There is no Wormbase ID attached to this Document."
                                             " So no data can be retrieved")
        else:
            raise WormbaseRetrievalException("There is more than one Wormbase ID attached to this Document."
                                             " Please try with just one Wormbase ID")
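
    # Illustrative usage (a sketch, not executed here; requires network access
    # and uses the 'wormbase_api_root_url' configuration entry, defaulting to
    # http://rest.wormbase.org). The WormBase ID is the docstring's example.
    #
    #   >>> d = Document(wbid='WBPaper00044287')
    #   >>> d.update_from_wormbase(replace_existing=True)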

    def _crossref_doi_extract(self):
        # Extract data from crossref
        def crRequest(doi):
            data = {'q': doi}
            data_encoded = urlencode(data)
            return _json_request(
                'http://search.labs.crossref.org/dois?%s' % data_encoded)

        doi = self.doi()
        if doi[:4] == 'http':
            doi = _doi_uri_to_doi(doi)
        try:
            r = crRequest(doi)
        except Exception:
            logger.warning("Couldn't retrieve Crossref info", exc_info=True)
            return
        # XXX: I don't think coins is meant to be used, but it has structured
        # data...
        if len(r) > 0:
            extra_data = r[0]['coins'].split('&')
            fields = (x.split("=") for x in extra_data)
            fields = [[y.replace('+', ' ').strip() for y in x] for x in fields]
            authors = [x[1] for x in fields if x[0] == 'rft.au']
            for a in authors:
                self.author(a)

        # no error for bad ids, just an empty list
        if len(r) > 0:
            # Crossref can process multiple doi's at one go and return the
            # metadata. we just need the first one
            r = r[0]
            if 'title' in r:
                self.title(r['title'])
            if 'year' in r:
                self.year(r['year'])

    def update_from_pubmed(self):
        def pmRequest(pmid):
            import xml.etree.ElementTree as ET  # Python 2.5 and up
            url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=' + str(pmid)
            key = self.get('pubmed.api_key', None)
            if key:
                url += '&api_key=' + key
            else:
                logger.warning("PubMed API key not defined. API calls will be limited.")
            s = _url_request(url)
            if hasattr(s, 'charset'):
                parser = ET.XMLParser(encoding=s.charset)
            else:
                parser = None

            return ET.parse(s, parser)

        pmid = self.pmid.defined_values
        if len(pmid) == 1:
            pmid = pmid[0].identifier.toPython()
            try:
                tree = pmRequest(pmid)
            except Exception:
                logger.warning("Couldn't retrieve Pubmed info", exc_info=True)
                return

            for x in tree.findall('./DocSum/Item[@Name="AuthorList"]/Item'):
                self.author(x.text)
            for x in tree.findall('./DocSum/Item[@Name="Title"]'):
                self.title(x.text)
            for x in tree.findall('./DocSum/Item[@Name="DOI"]'):
                self.doi(x.text)
            for x in tree.findall('./DocSum/Item[@Name="PubDate"]'):
                self.year(x.text)
        elif len(pmid) == 0:
            raise PubmedRetrievalException('No Pubmed ID is attached to this document.'
                                           ' Cannot retrieve Pubmed data')
        else:
            raise PubmedRetrievalException('More than one Pubmed ID is attached to this document.'
                                           ' Please try with just one Pubmed ID')
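
    # Illustrative usage (a sketch, not executed here): the same pattern works
    # for PubMed. A 'pubmed.api_key' configuration entry is used when present;
    # otherwise requests fall back to NCBI's unauthenticated rate limits. The
    # PMID is the example from the class docstring.
    #
    #   >>> d = Document(pubmed='24098140')
    #   >>> d.update_from_pubmed()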


def _wormbase_uri_to_wbid(uri):
    return str(urlparse(uri).path.split("/")[2])


def _pubmed_uri_to_pmid(uri):
    return str(urlparse(uri).path.split("/")[2])


def _doi_uri_to_doi(uri):
    # DOI URL to DOI translation is complicated. This is a cop-out.
    parsed = urlparse(uri)
    if 'doi.org' in parsed.netloc:
        doi = parsed.path.split("/", 1)[1]
    else:
        doi = None

    return doi


class EmptyRes(object):
    def read(self):
        return bytes()


def _url_request(url, headers={}):
    try:
        r = Request(url, headers=headers)
        s = urlopen(r, timeout=1)
        info = dict(s.info())
        content_type = {k.lower(): info[k] for k in info}['content-type']
        md = re.search("charset *= *([^ ]+)", content_type)
        if md:
            s.charset = md.group(1)
        return s
    except HTTPError:
        logger.error("Error in request for {}".format(url), exc_info=True)
        return EmptyRes()
    except URLError:
        logger.error("Error in request for {}".format(url), exc_info=True)
        return EmptyRes()


def _json_request(url):
    import json
    headers = {'Accept': 'application/json'}
    try:
        data = _url_request(url, headers).read().decode('UTF-8')
        if hasattr(data, 'charset'):
            return json.loads(data, encoding=data.charset)
        else:
            return json.loads(data)
    except BaseException:
        logger.warning("Couldn't retrieve JSON data from " + url, exc_info=True)
        return {}


__yarom_mapped_classes__ = (BaseDocument, Document)
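

# Illustrative behaviour of the URL helpers (doctest-style sketch using
# hypothetical URLs):
#
#   >>> _pubmed_uri_to_pmid('http://www.ncbi.nlm.nih.gov/pubmed/24098140')
#   '24098140'
#   >>> _doi_uri_to_doi('https://doi.org/10.1007/s00454-010-9273-0')
#   '10.1007/s00454-010-9273-0'
#   >>> _doi_uri_to_doi('https://example.org/10.1007/xyz') is None  # non-doi.org hosts
#   True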