Source code for GenDBScraper.StringDBScraper

""" :module StringDBScraper: Hosting the StringDBScraper, an API for the https://string-db.org database web interface. """

from GenDBScraper.RESTScraper import RESTScraper
from GenDBScraper.Utilities import web_utilities

# 3rd party imports
from collections import namedtuple
from doi2bib import crossref
from io import StringIO
from pubmed_lookup import Publication, PubMedLookup
import json
import logging
import os
import pandas
import re
import tempfile

# Configure the root logger at import time: timestamped, level-tagged
# messages, with INFO as the minimum level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
)

# Define the query datastructure.
class stringdb_query(namedtuple('stringdb_query', ('taxonId', 'features'))):
    """ The query submitted to string-db.org.

    :param taxonId: The NCBI taxon identifier of the organism (default '216595').
    :type taxonId: (str | int | None)

    :param features: The genes/proteins to query. If omitted, every instance
        gets its own new empty list (a list given as a namedtuple default would
        be one object shared by all default-constructed queries).
    :type features: list
    """

    # Keep instances as lightweight as a plain namedtuple (no per-instance __dict__).
    __slots__ = ()

    def __new__(cls, taxonId='216595', features=None):
        if features is None:
            features = []
        return super().__new__(cls, taxonId, features)

class StringDBScraper(RESTScraper):
    """ An API for the string-db.org protein interaction database.

    All request methods post the current query (see :attr:`query`) to the
    string-db.org REST interface and, except for :meth:`network_image`,
    return the JSON response as a pandas.DataFrame.
    """

    # Sent with every request as the 'caller_identity' form field.
    _CALLER_IDENTITY = "https://gendbscraper.readthedocs.io"

    # Maps the accepted 'image_format' values to the output format used in the request URL.
    _IMAGE_FORMATS = {
        'png': 'image',
        'image': 'image',
        'hires_png': 'highres_image',
        'highres_image': 'highres_image',
        'svg': 'svg',
    }

    # Column layout shared by the network and interaction partner tables.
    _INTERACTION_COLUMNS = [
        'stringId_A',
        'stringId_B',
        'ncbiTaxonId',
        'score',
        'nscore',
        'fscore',
        'pscore',
        'ascore',
        'escore',
        'dscore',
        'tscore',
    ]

    # Class constructor
    def __init__(self, query=None):
        """
        StringDBScraper constructor.

        :param query: The query to submit to string-db.org
        :type query: (dict | stringdb_query)
        """

        # Base class initialization.
        base_url = "http://string-db.org"
        super().__init__(base_url)

        # Goes through the property setter, which validates and normalizes.
        self.query = query

    @property
    def query(self):
        """ Get the query.

        :return: The query object.
        :rtype: stringdb_query
        """
        return self.__query

    @query.setter
    def query(self, val):
        """ Set the query attribute.

        :param val: The value to set. None yields a query without taxonId and features.
        :type val: (stringdb_query | dict | None)

        :raises TypeError: If val is neither a dict nor a stringdb_query, or if
            a dict carries a 'taxonId' that is neither None, str, nor int.
        :raises KeyError: If a dict has keys other than 'taxonId' and 'features',
            or lacks the 'features' key.
        """
        if val is None:
            val = stringdb_query(taxonId=None, features=[])

        if not isinstance(val, (dict, stringdb_query)):
            raise TypeError("The parameter 'query' must be a dict or stringdb_query. Examples: query={'taxonId' : '216595', 'features' : ['pflu0916']}; query=stringdb_query(taxonId='216595', features=['pflu0916', 'pflu0917']).")

        # Check keys if dict.
        if isinstance(val, dict):
            # Only these are acceptable query keywords.
            accepted_keys = ('taxonId', 'features')
            for k in val:
                if k not in accepted_keys:
                    raise KeyError("Only {0:s} are acceptable keys.".format(",".join(accepted_keys)))

            # A missing taxonId is completed to None ("no species restriction");
            # the caller's dict is left untouched.
            taxon_id = val.get('taxonId')
            if taxon_id is not None and not isinstance(taxon_id, (str, int)):
                raise TypeError("taxonId must be a valid NCBI taxonId (str or int).")

            if 'features' not in val:
                raise KeyError("You must specify a list of genes or products ('features').")

            # Convert to stringdb_query
            logging.info('Query dictionary passed to string-db scraper will now be converted to a stringdb_query object. See reference manual for more details.')
            val = stringdb_query(taxonId=taxon_id, features=val['features'])

        self.__query = val

    def _api_url(self, output_format, method):
        """ Assemble the request URL <base_url>/api/<output_format>/<method>. """
        return "/".join([self.base_url, 'api', output_format, method])

    def _request_data(self, **extra):
        """ Build the form data common to all requests, plus method specific fields.

        :param extra: Additional form fields for the request.
        :return: The form data to post.
        :rtype: dict
        """
        taxon_id = self.query.taxonId
        data = dict(
            # Identifiers are sent carriage-return separated.
            identifiers="\r".join(self.query.features),
            species=taxon_id if taxon_id is not None else "",
        )
        data.update(extra)
        data['caller_identity'] = self._CALLER_IDENTITY

        return data

    def update_features(self):
        """ Replace the query features by the string-db identifiers. """

        resolved_ids = self.resolve_id(limit=1)

        self.query = stringdb_query(taxonId=self.query.taxonId, features=resolved_ids.preferredName.to_list())

    def resolve_id(self, **kwargs):
        """ Resolve the given identifier(s) to string-db.org's own identifiers.

        :param query: (Optional): A new query that replaces the current one before resolving.
        :type query: (stringdb_query | dict)

        :param limit: (Optional): Limit the number of matches per query identifier (best matches come first). Default: limit=1
        :type limit: int

        :return: The matches, indexed by the query item.
        :rtype: pandas.DataFrame
        """

        """ Taken from http://string-db.org/cgi/help.pl#Mapping-identifiers """

        if 'query' in kwargs:
            self.query = kwargs['query']

        query_url = self._api_url('json', "get_string_ids")
        data = self._request_data(
            limit=kwargs.get('limit', 1),
            echo_query=1,
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data)

        ret = pandas.DataFrame(response.json())

        # An empty result set has no 'queryItem' column to index by.
        if 'queryItem' in ret.columns:
            ret.index = ret['queryItem']
            del ret['queryItem']

        # Re-index.
        return ret.reindex(columns=['queryIndex', 'preferredName', 'stringId', 'ncbiTaxonId', 'taxonName', 'annotation'])

    def network_image(self, query=None, image_format='png', flavor=None, white_nodes=None, color_nodes=None, show_image=False):
        """ Grab the protein network image for given proteins (genes).

        :param query: The (updated) query to submit.
        :type query: (stringdb_query | dict)

        :param image_format: The image format for the network image (png, svg, hires_png)
        :type image_format: str

        :param flavor: The type of network to draw between nodes (evidence, confidence (default), or actions).
        :type flavor: str

        :param white_nodes: The number of white nodes to add. Default is 10 for single queries, 0 for multiple queries.
        :type white_nodes: int

        :param color_nodes: The number of color nodes to add. Default is 0.
        :type color_nodes: int

        :param show_image: Whether to render the image (default False). WARNING: untested feature.
        :type show_image: bool

        :return: Path of the temporary file the image was written to.
        :rtype: str

        :raises IOError: If the scraper is not connected.
        :raises KeyError: If image_format is not one of the supported formats.
        """

        """ Inspired by http://string-db.org/cgi/help.pl#Getting-STRING-network-image """

        if query is not None:
            self.query = query

        if not self.connected:
            raise IOError("Not connected to string-db.org.")

        try:
            api_format = self._IMAGE_FORMATS[image_format]
        except KeyError:
            raise KeyError("Unsupported image_format '{}'. Supported formats: {}.".format(image_format, ", ".join(self._IMAGE_FORMATS))) from None

        query_url = self._api_url(api_format, "network")
        data = self._request_data(
            add_white_nodes=white_nodes,
            add_color_nodes=color_nodes,
            required_score=None,
            network_flavor=flavor,
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        # Determine file extension from the resolved output format
        # (all formats except svg are png variants).
        suffix = ".svg" if api_format == "svg" else ".png"

        # Setup image file. mkstemp returns an already open descriptor; write
        # through it so it is closed again instead of being leaked.
        image_fd, image_file = tempfile.mkstemp(prefix='string-db_network_', suffix=suffix)
        with os.fdopen(image_fd, 'wb') as image_fp:
            image_fp.write(response.content)

        if show_image:
            from PIL import Image
            Image.open(image_file).show()

        return image_file

    def network_interactions(self, nodes=None):
        """ Get the string-db network interactions as a pandas.DataFrame.

        :param nodes: The number of nodes to add to the network based on their confidence score.
        :type nodes: int

        :return: One row per interaction with the individual and combined scores.
        :rtype: pandas.DataFrame
        """

        query_url = self._api_url('json', "network")
        data = self._request_data(
            add_nodes=nodes,
            required_score=None,
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        ret = pandas.DataFrame(response.json())

        return ret.reindex(columns=list(self._INTERACTION_COLUMNS))

    def interaction_partners(self, required_score=None, limit=None):
        """ Get the interaction partners.

        :param required_score: (Optional): The minimum score (0 <= required_score <= 1000) for an interaction to be considered.
        :type required_score: int

        :param limit: (Optional): Limit the number of matches per query identifier (best matches come first).
        :type limit: int

        :return: One row per interaction with the individual and combined scores.
        :rtype: pandas.DataFrame

        :raises TypeError: If limit or required_score is given but is not an integer.
        """

        query_url = self._api_url('json', "interaction_partners")

        if limit is not None and not isinstance(limit, int):
            raise TypeError("limit must be an integer, {} was supplied.".format(type(limit)))

        # None means "no cutoff requested" and must not be rejected.
        if required_score is not None and not isinstance(required_score, int):
            raise TypeError("required_score must be an integer (0 <= required_score <= 1000). It will be devided by 1000 to yield the actual minimum score cutoff.")

        data = self._request_data(
            required_score=required_score,
            limit=limit,
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        ret = pandas.DataFrame(response.json())

        return ret.reindex(columns=list(self._INTERACTION_COLUMNS))

    def similarity_scores(self):
        """ Get the similarity scores (string-db.org method 'homology').

        :raises NotImplementedError: Always; the feature is currently disabled.
        """

        raise NotImplementedError("This feature is currently not supported by string-db.org.")

    def functional_enrichments(self):
        """ Get the functional enrichments for the query features.

        :return: One row per enriched term.
        :rtype: pandas.DataFrame
        """

        query_url = self._api_url('json', "enrichment")
        data = self._request_data(
            background_string_ids=None,
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        # Setup and return dataframe.
        ret = pandas.DataFrame(response.json())

        return ret.reindex(
            columns=[
                'category',
                'term',
                'number_of_genes',
                'number_of_genes_in_background',
                'ncbiTaxonId',
                'inputGenes',
                'p_value',
                'fdr',
                'description',
            ]
        )

    def interaction_enrichments(self, required_score=None):
        """ Get the interaction enrichments.

        :param required_score: (Optional): The minimum score (0 <= required_score <= 1000) for an interaction to be considered.
        :type required_score: int

        :return: The network statistics and the enrichment p-value.
        :rtype: pandas.DataFrame

        :raises TypeError: If required_score is given but is not an integer.
        """

        query_url = self._api_url('json', "ppi_enrichment")

        if required_score is not None and not isinstance(required_score, int):
            raise TypeError("required_score must be an integer (0 <= required_score <= 1000). It will be devided by 1000 to yield the actual minimum score cutoff.")

        data = self._request_data(
            required_score=required_score,
            background_string_ids=None,
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        # Setup and return dataframe.
        ret = pandas.DataFrame(response.json())

        return ret.reindex(
            columns=[
                'number_of_nodes',
                'number_of_edges',
                'average_node_degree',
                'local_clustering_coefficient',
                'expected_number_of_edges',
                'p_value',
            ]
        )
if __name__ == "__main__":

    from argparse import ArgumentParser

    # Setup argument parser.
    # NOTE(review): the help texts mention pseudomonas.com although this module
    # targets string-db.org -- presumably carried over from another scraper; confirm.
    parser = ArgumentParser()
    parser.add_argument("-o",
                        "--outfile",
                        dest="outfile",
                        default=None,
                        required=False,
                        help="Where to write the query results.",
                        )

    parser.add_argument("-f",
                        "--feature",
                        dest="feature",
                        default=None,
                        required=True,
                        help="The gene/feature to query from pseudomonas.com.",
                        )

    # Exactly one of -s/--strain and -O/--organism must be given.
    org_group = parser.add_mutually_exclusive_group(required=True)
    org_group.add_argument("-s",
                           "--strain",
                           dest="strain",
                           default=None,
                           help="The strain to query from pseudomonas.com. Mutually exclusive with parameter -O/--organism option.",
                           )

    org_group.add_argument("-O",
                           "--organism",
                           dest="organism",
                           default=None,
                           help="The organism to query from pseudomonas.com. Mutually exclusive with parameter 'strain'.",
                           )

    # Parse arguments.
    args = parser.parse_args()

    # NOTE(review): _run_from_cli is not defined in the visible part of this
    # module -- confirm it exists, otherwise this call raises NameError.
    _run_from_cli(args)