Source code for emmaa.priors.reactome_prior

import re
import logging
import requests
from functools import lru_cache

from indra.sources import tas
from indra.databases.uniprot_client import get_gene_name
from indra.databases.hgnc_client import get_hgnc_id, get_uniprot_id

from emmaa.priors import get_drugs_for_gene, SearchTerm

logger = logging.getLogger('reactome_prior')


def make_prior_from_genes(gene_list):
    """Return reactome prior based on a list of genes

    The input genes are mapped to Reactome stable IDs, the pathways
    containing those entities are looked up, and every gene participating in
    any of those pathways is turned into a search term. Genes or IDs that
    cannot be mapped at any step are skipped with a logged warning.

    Parameters
    ----------
    gene_list : list of str
        List of HGNC symbols for genes

    Returns
    -------
    res : list of :py:class:`emmaa.priors.SearchTerm`
        List of search terms corresponding to all genes found in any reactome
        pathway containing one of the genes in the input gene list, sorted
        by gene name
    """
    # Step 1: HGNC symbol -> HGNC ID -> UniProt ID -> Reactome stable IDs
    all_reactome_ids = set()
    for gene_name in gene_list:
        hgnc_id = get_hgnc_id(gene_name)
        # Only look up the UniProt ID when the symbol resolved to an HGNC ID
        uniprot_id = get_uniprot_id(hgnc_id) if hgnc_id else None
        if not uniprot_id:
            logger.warning('Could not get Uniprot ID for HGNC symbol'
                           f' {gene_name}')
            continue
        reactome_ids = rx_id_from_up_id(uniprot_id)
        if not reactome_ids:
            logger.warning('Could not get Reactome ID for Uniprot ID'
                           f' {uniprot_id} with corresponding HGNC symbol'
                           f' {gene_name}')
            continue
        all_reactome_ids.update(reactome_ids)

    # Step 2: Reactome entity IDs -> pathways containing them
    all_pathways = set()
    for reactome_id in all_reactome_ids:
        if not re.match(r'^R-HSA-[0-9]', reactome_id):
            # skip non-human genes
            continue
        additional_pathways = get_pathways_containing_gene(reactome_id)
        if additional_pathways is not None:
            all_pathways.update(additional_pathways)

    # Step 3: pathways -> UniProt IDs of all participating genes
    all_genes = set()
    for pathway in all_pathways:
        additional_genes = get_genes_contained_in_pathway(pathway)
        if additional_genes is not None:
            all_genes.update(additional_genes)

    # Step 4: UniProt IDs -> search terms grounded to HGNC and UniProt
    gene_terms = []
    for uniprot_id in all_genes:
        hgnc_name = get_gene_name(uniprot_id)
        if hgnc_name is None:
            logger.warning('Could not get HGNC name for UniProt ID'
                           f' {uniprot_id}')
            continue
        hgnc_id = get_hgnc_id(hgnc_name)
        if not hgnc_id:
            logger.warning('Could not find HGNC ID for HGNC symbol'
                           f' {hgnc_name} with corresponding Uniprot ID'
                           f' {uniprot_id}')
            continue
        term = SearchTerm(type='gene', name=hgnc_name,
                          search_term=f'"{hgnc_name}"',
                          db_refs={'HGNC': hgnc_id,
                                   'UP': uniprot_id})
        gene_terms.append(term)
    return sorted(gene_terms, key=lambda x: x.name)


def find_drugs_for_genes(search_terms, drug_gene_stmts=None):
    """Return list of drugs targeting at least one gene from a list of genes

    Parameters
    ----------
    search_terms : list of :py:class:`emmaa.priors.SearchTerm`
        List of search terms for genes
    drug_gene_stmts : Optional[list]
        Drug-gene statements to search for drugs targeting the input genes.
        If None (the default), the statements are obtained with
        ``tas.process_from_web()``. An explicitly passed empty list is used
        as given and does not trigger a download.

    Returns
    -------
    drug_terms : list of :py:class:`emmaa.priors.SearchTerm`
        List of search terms of drugs targeting at least one of the input
        genes, without duplicate drug names and sorted by name
    """
    # Only fall back to the web resource when the caller supplied nothing;
    # a plain truthiness test would also re-download for an empty list.
    if drug_gene_stmts is None:
        drug_gene_stmts = tas.process_from_web().statements
    drug_terms = []
    already_added = set()
    for search_term in search_terms:
        # Terms that are not genes have no drug targets to look up
        if search_term.type != 'gene':
            continue
        hgnc_id = search_term.db_refs['HGNC']
        for drug in get_drugs_for_gene(drug_gene_stmts, hgnc_id):
            # The same drug can target several genes; keep it only once
            if drug.name not in already_added:
                drug_terms.append(drug)
                already_added.add(drug.name)
    return sorted(drug_terms, key=lambda x: x.name)


@lru_cache(10000)
def rx_id_from_up_id(up_id):
    """Return the Reactome Stable IDs for a given Uniprot ID.

    Parameters
    ----------
    up_id : str
        Uniprot ID used as the query of the Reactome search service

    Returns
    -------
    stable_ids : list of str or None
        The 'stId' of every entry in the search results, or None if the
        request does not return status 200 or yields no results. The return
        value (including None) is memoized per `up_id` by `lru_cache`, so
        callers should not modify the returned list in place.
    """
    react_search_url = 'http://www.reactome.org/ContentService/search/query'
    params = {'query': up_id, 'cluster': 'true', 'species': 'Homo sapiens'}
    headers = {'Accept': 'application/json'}
    res = requests.get(react_search_url, headers=headers, params=params)
    if res.status_code != 200:
        logger.debug(f'Reactome request to {react_search_url} failed')
        return None
    results = res.json().get('results')
    if not results:
        logger.warning(f'No results for {up_id}')
        return None
    stable_ids = []
    for result in results:
        # A result group without an 'entries' list contributes no IDs
        for entry in result.get('entries') or []:
            stable_id = entry.get('stId')
            if stable_id:
                stable_ids.append(stable_id)
    return stable_ids


@lru_cache(100000)
def up_id_from_rx_id(reactome_id):
    """Get the Uniprot ID (referenceEntity) for a given Reactome Stable ID.

    Parameters
    ----------
    reactome_id : str
        Reactome stable ID whose 'referenceEntity' attribute is queried

    Returns
    -------
    db_id : str or None
        The Uniprot ID of the reference entity. None is returned if the
        request does not return status 200, if the response does not have
        the expected three tab-separated fields, if the entity is not a
        'ReferenceGeneProduct', or if the reference is not of the form
        'UniProt:<id>'.
    """
    react_url = 'http://www.reactome.org/ContentService/data/query/' \
                + reactome_id + '/referenceEntity'
    res = requests.get(react_url)
    if res.status_code != 200:
        return None
    # The response is parsed as exactly three tab-separated fields; anything
    # else is treated as "no Uniprot ID" rather than raising ValueError.
    fields = res.text.split('\t')
    if len(fields) != 3:
        return None
    _, entry, entry_type = fields
    if entry_type != 'ReferenceGeneProduct':
        return None
    # The first space-delimited token of the entry is '<namespace>:<id>'
    id_parts = entry.split(' ')[0].split(':')
    if len(id_parts) != 2:
        return None
    db_ns, db_id = id_parts
    if db_ns != 'UniProt':
        return None
    return db_id


@lru_cache(1000)
def get_pathways_containing_gene(reactome_id):
    """Get all ids for reactome pathways containing some form of an entity

    Parameters
    ----------
    reactome_id : str
        Reactome id for a gene

    Returns
    -------
    pathway_ids : list of str or None
        List of reactome ids ('stIdVersion') for pathways containing the
        input gene. None if the request does not return status 200 or the
        response is empty.
    """
    endpoint = ('http://www.reactome.org/ContentService/data/pathways/low'
                f'/entity/{reactome_id}/allForms')
    response = requests.get(endpoint,
                            headers={'Accept': 'application/json'},
                            params={'species': 'Homo sapiens'})
    if response.status_code != 200:
        logger.warning(f'Request failed for reactome_id {reactome_id}')
        return None
    pathways = response.json()
    if pathways:
        return [pathway['stIdVersion'] for pathway in pathways]
    # An empty payload is reported and signalled to the caller as None
    logger.info(f'No results for {reactome_id}')
    return None


@lru_cache(1000)
def get_genes_contained_in_pathway(reactome_id):
    """Get all genes contained in a given pathway

    Parameters
    ----------
    reactome_id : str
        Reactome id for a pathway

    Returns
    -------
    genes : list of str or None
        List of uniprot ids for all unique genes contained in input pathway.
        An empty list if the response contains no participants, and None if
        the request does not return status 200.
    """
    react_url = ('http://www.reactome.org/ContentService/data'
                 f'/participants/{reactome_id}')
    params = {'species': 'Homo sapiens'}
    headers = {'Accept': 'application/json'}
    res = requests.get(react_url, headers=headers, params=params)
    # Check the status before decoding: an error response is not guaranteed
    # to carry a JSON body.
    if res.status_code != 200:
        logger.warning(f'Request failed for reactome_id {reactome_id}')
        return None
    results = res.json()
    if not results:
        logger.info(f'No results for {reactome_id}')
        return []
    # Keep only gene products; a set removes genes that participate in the
    # pathway more than once.
    genes = {entity['identifier']
             for result in results
             for entity in result.get('refEntities') or []
             if entity.get('schemaClass') == 'ReferenceGeneProduct'}
    return list(genes)