Source code for emmaa.xdd.xdd_client
import os
import requests
import logging
from indra_db import get_db
logger = logging.getLogger(__name__)
api_key = os.environ.get('XDD_API_KEY')
doc_url = 'https://xdd.wisc.edu/sets/xdd-covid-19/cosmos/api/document'
query_url = 'https://xdd.wisc.edu/sets/xdd-covid-19/cosmos/api/search'
[docs]def get_document_objects(doi):
"""Get a list of figure/table object dictionaries for a given DOI."""
logger.info(f'Got a request to get figures for DOI {doi}')
# Get first batch of results and find the total number of results
rj = send_document_search_request(doi, page=0)
if not rj:
return []
total = rj.get('total', 0)
logger.info(f'Got a total of {total} objects')
objects = rj['objects']
page = 0
while len(objects) < total:
page += 1
rj = send_document_search_request(doi, page=page)
if not rj:
logger.warning(f'Did not get results for {doi} page {page}')
break
objects += rj['objects']
return objects
[docs]def get_document_figures(paper_id, paper_id_type):
"""Get figures and tables from a given paper.
Parameters
----------
paper_id : str or int
ID of a paper.
paper_id_type : str
A name of a paper ID type (PMID, PMCID, DOI, TRID).
Returns
-------
figures : list[tuple]
A list of tuples where each tuple is a figure title and bytes content.
"""
paper_id_type = paper_id_type.upper()
if paper_id_type == 'DOI':
doi = paper_id
else:
db = get_db('primary')
if paper_id_type == 'TRID':
tr = db.select_one(db.TextRef, db.TextRef.id == paper_id)
elif paper_id_type == 'PMID':
tr = db.select_one(db.TextRef, db.TextRef.pmid == paper_id)
elif paper_id_type == 'PMCID':
tr = db.select_one(db.TextRef, db.TextRef.pmcid == paper_id)
ref_dict = tr.get_ref_dict()
doi = ref_dict.get('DOI')
if not doi:
logger.warning(f'Could not get DOI from {paper_id_type} {paper_id}, '
'returning 0 figures and tables')
return []
objects = get_document_objects(doi)
if not objects:
return []
figures = get_figures_from_objects(objects)
logger.info(f'Returning {len(figures)} figures and tables.')
return figures
[docs]def get_figures_from_query(query, limit=None):
"""Get figures and tables from a query.
Parameters
----------
query : str
An entity name or comma-separated entity names to query for.
limit : int or None
A number of figures and tables to return.
Returns
-------
figures : list[tuple]
A list of tuples where each tuple is a link to the paper, a figure
title and bytes content.
"""
logger.info(f'Got a request for query {query} with limit {limit}')
# Get first batch of results and find the total number of results
rj = send_query_search_request(query, page=0)
if not rj:
return []
total = rj.get('total', 0)
logger.info(f'Got a total of {total} objects')
objects = rj['objects']
page = 0
# If there's a limit of number of figures so we can stop when we reach it
# or when we run out of objects
if limit:
figures = get_figures_from_objects(objects, True)
while len(figures) < limit and len(objects) < total:
page += 1
rj = send_query_search_request(query, page)
if not rj:
logger.warning(f'Did not get results for {query}, page {page}')
break
new_figures = get_figures_from_objects(rj['objects'], True)
figures += new_figures
objects += rj['objects']
figures = figures[: limit]
logger.info(f'Returning {len(figures)} figures and tables.')
return figures
# There's no limit so we want to get all objects before getting figures
while len(objects) < total:
page += 1
rj = send_query_search_request(query, page)
if not rj:
logger.warning(f'Did not get results for {query} page {page}')
break
objects += rj['objects']
figures = get_figures_from_objects(objects, True)
logger.info(f'Returning {len(figures)} figures and tables.')
return figures
[docs]def send_request(url, params):
"""Send a request and handle potential errors."""
try:
res = requests.get(url, params=params)
# Catch connection error
except Exception as e:
logger.info(e)
return
try:
rj = res.json()
if 'objects' not in rj:
params.pop('api_key')
logger.warning(f'Could not get objects for {params}')
if 'error' in rj:
logger.warning(rj['error'])
return
# Catch bad response
except Exception as e:
logger.info(e)
return
return rj
[docs]def send_query_search_request(query, page):
"""Send a request to get one page of results for a query."""
logger.info(f'Sending a request for query {query}, page {page}')
return send_request(
query_url,
{'query': query, 'inclusive': True, 'page': page, 'api_key': api_key})
[docs]def send_document_search_request(doi, page):
"""Send a request to get one page of results for a DOI."""
logger.info(f'Sending a request for DOI {doi}, page {page}')
return send_request(doc_url,
{'doi': doi, 'api_key': api_key, 'page': page})
[docs]def get_figures_from_objects(objects, paper_links=False):
"""Get a list of paper links, figure titles and their content bytes from
a list of object dictionaries (returned from query or document api)."""
figures = []
for obj in objects:
for child in obj['children']:
if child['cls'] in ['Figure', 'Table']:
txt = child['header_content']
b = child['bytes']
if paper_links:
urls = set()
for link in obj['bibjson']['link']:
urls.add(link['url'])
figures.append((urls, txt, b))
else:
figures.append((txt, b))
return figures