Source code for emmaa.xdd.xdd_client

import os
import requests
import logging
from indra_db import get_db


# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# API key read from the XDD_API_KEY environment variable (None when the
# variable is not set); it is sent as the 'api_key' request parameter.
api_key = os.environ.get('XDD_API_KEY')
# Endpoint used by send_document_search_request (lookup by DOI).
doc_url = 'https://xdd.wisc.edu/sets/xdd-covid-19/cosmos/api/document'
# Endpoint used by send_query_search_request (lookup by query string).
query_url = 'https://xdd.wisc.edu/sets/xdd-covid-19/cosmos/api/search'


def get_document_objects(doi):
    """Get a list of figure/table object dictionaries for a given DOI.

    Parameters
    ----------
    doi : str
        DOI of the paper to look up.

    Returns
    -------
    objects : list[dict]
        Object dictionaries collected from all result pages. The list is
        empty if the first request fails and partial if a later page could
        not be retrieved or came back empty.
    """
    logger.info(f'Got a request to get figures for DOI {doi}')
    # Get first batch of results and find the total number of results
    rj = send_document_search_request(doi, page=0)
    if not rj:
        return []
    total = rj.get('total', 0)
    logger.info(f'Got a total of {total} objects')
    # Copy the first page so that extending the result below does not
    # mutate the list that belongs to the response payload.
    objects = list(rj['objects'])
    page = 0
    while len(objects) < total:
        page += 1
        rj = send_document_search_request(doi, page=page)
        if not rj:
            logger.warning(f'Did not get results for {doi} page {page}')
            break
        new_objects = rj['objects']
        if not new_objects:
            # An empty page adds nothing, so the reported total can never
            # be reached; stop instead of requesting pages forever.
            logger.warning(f'Got an empty page {page} for {doi} before '
                           f'reaching the reported total of {total} objects')
            break
        objects += new_objects
    return objects


def get_document_figures(paper_id, paper_id_type):
    """Get figures and tables from a given paper.

    Parameters
    ----------
    paper_id : str or int
        ID of a paper.
    paper_id_type : str
        A name of a paper ID type (PMID, PMCID, DOI, TRID). The name is
        matched case-insensitively.

    Returns
    -------
    figures : list[tuple]
        A list of tuples where each tuple is a figure title and bytes content.
        The list is empty if the ID type is not supported, if no DOI could
        be found for the paper, or if no objects were found for the DOI.
    """
    paper_id_type = paper_id_type.upper()
    if paper_id_type == 'DOI':
        doi = paper_id
    else:
        # Validate the ID type before opening a database connection; an
        # unsupported type previously crashed on an unbound variable.
        if paper_id_type not in ('TRID', 'PMID', 'PMCID'):
            logger.warning(f'Unsupported paper ID type {paper_id_type}, '
                           'returning 0 figures and tables')
            return []
        db = get_db('primary')
        if paper_id_type == 'TRID':
            tr = db.select_one(db.TextRef, db.TextRef.id == paper_id)
        elif paper_id_type == 'PMID':
            tr = db.select_one(db.TextRef, db.TextRef.pmid == paper_id)
        else:
            tr = db.select_one(db.TextRef, db.TextRef.pmcid == paper_id)
        # NOTE(review): select_one presumably returns None when no text ref
        # matches - confirm against indra_db. Guarding here lets that case
        # fall through to the "could not get DOI" path below.
        doi = None
        if tr is not None:
            ref_dict = tr.get_ref_dict()
            doi = ref_dict.get('DOI')
    if not doi:
        logger.warning(f'Could not get DOI from {paper_id_type} {paper_id}, '
                       'returning 0 figures and tables')
        return []
    objects = get_document_objects(doi)
    if not objects:
        return []
    figures = get_figures_from_objects(objects)
    logger.info(f'Returning {len(figures)} figures and tables.')
    return figures


def get_figures_from_query(query, limit=None):
    """Get figures and tables from a query.

    Parameters
    ----------
    query : str
        An entity name or comma-separated entity names to query for.
    limit : int or None
        A number of figures and tables to return. A falsy value (None or 0)
        means that all available figures and tables are returned.

    Returns
    -------
    figures : list[tuple]
        A list of tuples where each tuple is a link to the paper, a figure
        title and bytes content. The list is empty if the first request
        fails and may be incomplete if a later page could not be retrieved
        or came back empty.
    """
    logger.info(f'Got a request for query {query} with limit {limit}')
    # Get first batch of results and find the total number of results
    rj = send_query_search_request(query, page=0)
    if not rj:
        return []
    total = rj.get('total', 0)
    logger.info(f'Got a total of {total} objects')
    # Copy the first page so that extending the result below does not
    # mutate the list that belongs to the response payload.
    objects = list(rj['objects'])
    page = 0
    # If there is a limit on the number of figures, extract figures page by
    # page so that we can stop as soon as we reach the limit or run out of
    # objects.
    if limit:
        figures = get_figures_from_objects(objects, True)
        while len(figures) < limit and len(objects) < total:
            page += 1
            rj = send_query_search_request(query, page)
            if not rj:
                logger.warning(f'Did not get results for {query}, page {page}')
                break
            new_objects = rj['objects']
            if not new_objects:
                # An empty page adds nothing, so the reported total can
                # never be reached; stop instead of looping forever.
                logger.warning(f'Got an empty page {page} for {query} '
                               f'before reaching the reported total of '
                               f'{total} objects')
                break
            figures += get_figures_from_objects(new_objects, True)
            objects += new_objects
        figures = figures[:limit]
        logger.info(f'Returning {len(figures)} figures and tables.')
        return figures
    # There's no limit so we want to get all objects before getting figures
    while len(objects) < total:
        page += 1
        rj = send_query_search_request(query, page)
        if not rj:
            logger.warning(f'Did not get results for {query} page {page}')
            break
        new_objects = rj['objects']
        if not new_objects:
            # Same guard as above: an empty page would otherwise make this
            # loop run forever.
            logger.warning(f'Got an empty page {page} for {query} before '
                           f'reaching the reported total of {total} objects')
            break
        objects += new_objects
    figures = get_figures_from_objects(objects, True)
    logger.info(f'Returning {len(figures)} figures and tables.')
    return figures


def send_request(url, params, timeout=None):
    """Send a GET request and return the parsed JSON response.

    Parameters
    ----------
    url : str
        URL to send the request to.
    params : dict
        Query parameters of the request. The dictionary is not modified.
    timeout : float or tuple or None
        Timeout passed on to ``requests.get``. The default (None) sets no
        timeout, which is what this function did before the parameter
        existed.

    Returns
    -------
    rj : dict or None
        The decoded JSON response if it is a dictionary with an 'objects'
        key. None if the request failed, the response could not be decoded
        or the response does not contain 'objects'.
    """
    # Copy of the parameters without the API key, used only for logging, so
    # that the caller's dictionary is left untouched and a missing
    # 'api_key' entry cannot raise.
    safe_params = {key: value for key, value in dict(params or {}).items()
                   if key != 'api_key'}
    try:
        res = requests.get(url, params=params, timeout=timeout)
    # Deliberately broad: this is a best-effort boundary and any failure to
    # reach the service is reported to the caller as "no result".
    # NOTE(review): the exception text may contain the full request URL,
    # possibly including the api_key query parameter - confirm and redact
    # if needed.
    except Exception as e:
        logger.warning(e)
        return None
    try:
        rj = res.json()
    # Catch bad response (body that is not valid JSON)
    except (ValueError, requests.exceptions.RequestException) as e:
        logger.warning(e)
        return None
    if not isinstance(rj, dict) or 'objects' not in rj:
        logger.warning(f'Could not get objects for {safe_params}')
        if isinstance(rj, dict) and 'error' in rj:
            logger.warning(rj['error'])
        return None
    return rj


def send_query_search_request(query, page):
    """Request a single page of search results for a query.

    Parameters
    ----------
    query : str
        The query to search for.
    page : int
        The number of the results page to request.

    Returns
    -------
    dict or None
        The result of send_request for the search endpoint.
    """
    logger.info(f'Sending a request for query {query}, page {page}')
    request_params = {
        'query': query,
        'inclusive': True,
        'page': page,
        'api_key': api_key,
    }
    return send_request(query_url, request_params)


def send_document_search_request(doi, page):
    """Request a single page of document results for a DOI.

    Parameters
    ----------
    doi : str
        The DOI to look up.
    page : int
        The number of the results page to request.

    Returns
    -------
    dict or None
        The result of send_request for the document endpoint.
    """
    logger.info(f'Sending a request for DOI {doi}, page {page}')
    request_params = {
        'doi': doi,
        'api_key': api_key,
        'page': page,
    }
    return send_request(doc_url, request_params)


def get_figures_from_objects(objects, paper_links=False):
    """Extract figures and tables from a list of object dictionaries.

    Parameters
    ----------
    objects : list[dict]
        Object dictionaries (returned from query or document api). Each
        one has a 'children' list; children whose 'cls' is 'Figure' or
        'Table' are extracted.
    paper_links : bool
        If True, each returned tuple starts with the set of URLs listed
        under the object's 'bibjson' -> 'link' entries.

    Returns
    -------
    figures : list[tuple]
        Tuples of (title, bytes content), or (urls, title, bytes content)
        if paper_links is True, in the order the children are listed.
    """
    wanted_classes = ('Figure', 'Table')
    collected = []
    for paper in objects:
        for item in paper['children']:
            if item['cls'] not in wanted_classes:
                continue
            title = item['header_content']
            content = item['bytes']
            if not paper_links:
                collected.append((title, content))
                continue
            # Each figure gets its own set of the paper's link URLs
            paper_urls = {entry['url'] for entry in paper['bibjson']['link']}
            collected.append((paper_urls, title, content))
    return collected