""" :module StringDBScraper: Hosting the StringDBScraper, an API for the https://string-db.org database web interface. """
from GenDBScraper.RESTScraper import RESTScraper
from GenDBScraper.Utilities import web_utilities
# Standard library and 3rd party imports
from collections import namedtuple
from doi2bib import crossref
from io import StringIO
from pubmed_lookup import Publication, PubMedLookup
import json
import logging
import os
import pandas
import re
import tempfile
# Root logger setup: timestamp, level name and message; INFO and above are shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
)

# Query record handed to the scraper: an NCBI taxon id plus the list of
# gene/protein identifiers ("features") to look up.
# NOTE: the default for 'features' is a single list object shared by every
# instance created without an explicit 'features' argument.
stringdb_query = namedtuple(
    'stringdb_query',
    ['taxonId', 'features'],
    defaults=['216595', []],
)
class StringDBScraper(RESTScraper):
    """ An API for the string-db.org protein interaction database.

    The scraper holds one query (an NCBI taxon id plus a list of gene/protein
    identifiers, see ``stringdb_query``) and posts it to the string-db.org
    REST endpoints. Tabular results are returned as pandas.DataFrame objects.
    """

    # Class constructor
    def __init__(self, query=None):
        """
        StringDBScraper constructor.

        :param query: The query to submit to string-db.org. If None, an empty query (taxonId=None, features=[]) is stored.
        :type query: (stringdb_query | dict)
        """

        # Base class initialization.
        base_url = "http://string-db.org"
        super().__init__(base_url)

        # Assigned through the property setter, which validates and normalizes the value.
        self.query = query

    @property
    def query(self):
        """ Get the query.

        :return: The query object.
        :rtype: stringdb_query
        """
        return self.__query

    @query.setter
    def query(self, val):
        """ Set the query attribute.

        A dict is validated and converted to a stringdb_query; a stringdb_query is stored as is.

        :param val: The value to set. None yields an empty query (taxonId=None, features=[]).
        :type val: (stringdb_query | dict)

        :raises TypeError: If val is neither a dict nor a stringdb_query, or if 'taxonId' is given but is not a str or int.
        :raises KeyError: If a dict contains keys other than 'taxonId' and 'features', or lacks 'features'.
        """

        # Checks
        if val is None:
            val = stringdb_query(taxonId=None, features=[])

        if not isinstance(val, (dict, stringdb_query)):
            raise TypeError(
                "The parameter 'query' must be a dict or stringdb_query. "
                "Examples: query={'taxonId': '216595', 'features': ['pflu0916']}; "
                "query=stringdb_query(taxonId='216595', features=['pflu0916', 'pflu0917'])."
            )

        # Check keys if dict.
        if isinstance(val, dict):
            # Only these are acceptable query keywords.
            accepted_keys = ('taxonId', 'features')
            for k in val:
                if k not in accepted_keys:
                    raise KeyError("Only {0:s} are acceptable keys.".format(",".join(accepted_keys)))

            # A missing taxonId defaults to None ("no species restriction"), in line
            # with the empty query built above. The caller's dict is not modified.
            taxon_id = val.get('taxonId')
            if taxon_id is not None and not isinstance(taxon_id, (str, int)):
                raise TypeError("taxonId must be a valid NCBI taxonId (str or int).")

            if 'features' not in val:
                raise KeyError("You must specify a list of genes or products ('features').")

            # Convert to stringdb_query
            logging.info('Query dictionary passed to string-db scraper will now be converted to a stringdb_query object. See reference manual for more details.')
            val = stringdb_query(taxonId=taxon_id, features=val['features'])

        self.__query = val

    def update_features(self):
        """ Replace the query features by the string-db identifiers.

        Resolves the current features with :meth:`resolve_id` (one match per
        feature) and stores the returned 'preferredName' values as the new features.
        """

        resolved_ids = self.resolve_id(limit=1)

        self.query = stringdb_query(taxonId=self.query.taxonId, features=resolved_ids.preferredName.to_list())

    def resolve_id(self, **kwargs):
        """ Resolve the given identifier(s) to string-db.org's own identifiers.

        :param query: (Optional): A new query that replaces the current one before resolving.
        :type query: (stringdb_query | dict)

        :param limit: (Optional): Limit the number of matches per query identifier (best matches come first). Default: limit=1
        :type limit: int

        :return: The matches, indexed by the submitted identifier ('queryItem').
        :rtype: pandas.DataFrame
        """
        # Taken from http://string-db.org/cgi/help.pl#Mapping-identifiers

        if 'query' in kwargs:
            self.query = kwargs['query']

        method = "get_string_ids"
        query_url = "/".join([self.base_url, 'api', 'json', method])

        data = dict(
            identifiers="\r".join(self.query.features),
            species=self.query.taxonId if self.query.taxonId is not None else "",
            limit=kwargs.get('limit', 1),
            echo_query=1,
            caller_identity="https://gendbscraper.readthedocs.io",
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data)

        ret = pandas.DataFrame(response.json())

        # Use the submitted identifier as index and drop it from the columns.
        ret.index = ret['queryItem']
        del ret['queryItem']

        # Re-index.
        return ret.reindex(columns=['queryIndex', 'preferredName', 'stringId', 'ncbiTaxonId', 'taxonName', 'annotation'])

    def network_image(self, query=None, image_format='png', flavor=None, white_nodes=None, color_nodes=None, show_image=False):
        """ Grab the protein network image for given proteins (genes).

        :param query: The (updated) query to submit.
        :type query: (stringdb_query | dict)

        :param image_format: The image format for the network image (png, svg, hires_png)
        :type image_format: str

        :param flavor: The type of network to draw between nodes (evidence, confidence (default), or actions).
        :type flavor: str

        :param white_nodes: The number of white nodes to add. Default is 10 for single queries, 0 for multiple queries.
        :type white_nodes: int

        :param color_nodes: The number of color nodes to add. Default is 0.
        :type color_nodes: int

        :param show_image: Whether to render the image (default False). WARNING: untested feature.
        :type show_image: bool

        :return: Path of the temporary file the image was written to.
        :rtype: str

        :raises IOError: If the scraper is not connected.
        :raises KeyError: If image_format is not one of the supported formats.
        """
        # Inspired by http://string-db.org/cgi/help.pl#Getting-STRING-network-image

        if query is not None:
            self.query = query

        if not self.connected:
            raise IOError("Not connected to string-db.org.")

        # Maps the accepted image_format values to the output-format part of the API URL.
        format_map = {
            'png': 'image',
            'image': 'image',
            'hires_png': 'highres_image',
            'highres_image': 'highres_image',
            'svg': 'svg',
        }
        if image_format not in format_map:
            raise KeyError(
                "Unsupported image_format '{}'. Supported formats: {}.".format(image_format, ", ".join(format_map))
            )
        api_format = format_map[image_format]

        method = "network"
        query_url = "/".join([self.base_url, 'api', api_format, method])

        data = dict(
            identifiers="\r".join(self.query.features),
            species=self.query.taxonId if self.query.taxonId is not None else "",
            add_white_nodes=white_nodes,
            add_color_nodes=color_nodes,
            required_score=None,
            network_flavor=flavor,
            caller_identity="https://gendbscraper.readthedocs.io",
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        # Determine file extension. Decided on the mapped API format:
        # str.find() returns an index (0 for 'png', -1 for no match) and
        # must not be used as a truth value.
        suffix = ".svg" if api_format == 'svg' else ".png"

        # Setup image file. mkstemp returns an already open OS-level file
        # descriptor; wrap it so that it is closed after writing.
        image_fd, image_file = tempfile.mkstemp(prefix='string-db_network_', suffix=suffix)
        with os.fdopen(image_fd, 'wb') as image_fp:
            image_fp.write(response.content)

        if show_image:
            from PIL import Image
            Image.open(image_file).show()

        return image_file

    def network_interactions(self, nodes=None):
        """ Get the string-db network interactions as a pandas.DataFrame.

        :param nodes: The number of nodes to add to the network based on their confidence score.
        :type nodes: int

        :return: One row per interaction with the identifiers of both partners and the individual scores.
        :rtype: pandas.DataFrame
        """

        method = "network"
        query_url = "/".join([self.base_url, 'api', 'json', method])

        data = dict(
            identifiers="\r".join(self.query.features),
            species=self.query.taxonId if self.query.taxonId is not None else "",
            add_nodes=nodes,
            required_score=None,
            caller_identity="https://gendbscraper.readthedocs.io",
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        ret = pandas.DataFrame(response.json())

        return ret.reindex(
            columns=[
                'stringId_A',
                'stringId_B',
                'ncbiTaxonId',
                'score',
                'nscore',
                'fscore',
                'pscore',
                'ascore',
                'escore',
                'dscore',
                'tscore',
            ]
        )

    def interaction_partners(self, required_score=None, limit=None):
        """ Get the interaction partners.

        :param required_score: The minimum score for an interaction to be considered (0 <= required_score <= 1000). Default: None.
        :type required_score: int

        :param limit: Limit the number of matches per query identifier (best matches come first). Default: None.
        :type limit: int

        :return: One row per interaction with the identifiers of both partners and the individual scores.
        :rtype: pandas.DataFrame

        :raises TypeError: If limit or required_score is given but is not an integer.
        """

        method = "interaction_partners"
        query_url = "/".join([self.base_url, 'api', 'json', method])

        if limit is not None and not isinstance(limit, int):
            raise TypeError("limit must be an integer, {} was supplied.".format(type(limit)))

        # None is the documented default and must not be rejected.
        if required_score is not None and not isinstance(required_score, int):
            raise TypeError("required_score must be an integer (0 <= required_score <= 1000). It will be devided by 1000 to yield the actual minimum score cutoff.")

        data = dict(
            identifiers="\r".join(self.query.features),
            species=self.query.taxonId if self.query.taxonId is not None else "",
            required_score=required_score,
            limit=limit,
            caller_identity="https://gendbscraper.readthedocs.io",
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        ret = pandas.DataFrame(response.json())

        return ret.reindex(
            columns=[
                'stringId_A',
                'stringId_B',
                'ncbiTaxonId',
                'score',
                'nscore',
                'fscore',
                'pscore',
                'ascore',
                'escore',
                'dscore',
                'tscore',
            ]
        )

    def similarity_scores(self):
        """ Get the similarity (homology) scores. Currently not available.

        :raises NotImplementedError: Always; the feature is marked as not supported by string-db.org.
        """

        raise NotImplementedError("This feature is currently not supported by string-db.org.")

    def functional_enrichments(self):
        """ Get the functional enrichments for the query features.

        :return: One row per enriched term (category, term, gene counts, p-value, fdr, description).
        :rtype: pandas.DataFrame
        """

        method = "enrichment"
        query_url = "/".join([self.base_url, 'api', 'json', method])

        data = dict(
            identifiers="\r".join(self.query.features),
            background_string_ids=None,
            species=self.query.taxonId if self.query.taxonId is not None else "",
            caller_identity="https://gendbscraper.readthedocs.io",
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        # Setup and return dataframe.
        ret = pandas.DataFrame(response.json())

        return ret.reindex(
            columns=[
                'category',
                'term',
                'number_of_genes',
                'number_of_genes_in_background',
                'ncbiTaxonId',
                'inputGenes',
                'p_value',
                'fdr',
                'description',
            ]
        )

    def interaction_enrichments(self, required_score=None):
        """ Get the interaction enrichments.

        :param required_score: The minimum score for an interaction to be considered (0 <= required_score <= 1000). Default: None.
        :type required_score: int

        :return: Network statistics (nodes, edges, average node degree, clustering coefficient, expected edges, p-value).
        :rtype: pandas.DataFrame

        :raises TypeError: If required_score is given but is not an integer.
        """

        method = "ppi_enrichment"
        query_url = "/".join([self.base_url, 'api', 'json', method])

        if required_score is not None and not isinstance(required_score, int):
            raise TypeError("required_score must be an integer (0 <= required_score <= 1000). It will be devided by 1000 to yield the actual minimum score cutoff.")

        data = dict(
            identifiers="\r".join(self.query.features),
            background_string_ids=None,
            species=self.query.taxonId if self.query.taxonId is not None else "",
            # Forward the validated cutoff; it was previously checked but never sent.
            required_score=required_score,
            caller_identity="https://gendbscraper.readthedocs.io",
        )

        # Get the response from post.
        response = web_utilities.guarded_post(query_url, data=data)

        # Setup and return dataframe.
        ret = pandas.DataFrame(response.json())

        return ret.reindex(
            columns=[
                'number_of_nodes',
                'number_of_edges',
                'average_node_degree',
                'local_clustering_coefficient',
                'expected_number_of_edges',
                'p_value',
            ]
        )
if __name__ == "__main__":
    # Command line entry point: parse the arguments and hand them to the CLI runner.
    from argparse import ArgumentParser

    # Setup argument parser.
    # NOTE(review): the help texts mention pseudomonas.com although this module
    # targets string-db.org - presumably carried over from another scraper; confirm.
    parser = ArgumentParser()
    parser.add_argument("-o",
                        "--outfile",
                        dest="outfile",
                        default=None,
                        required=False,
                        help="Where to write the query results.",
                        )

    parser.add_argument("-f",
                        "--feature",
                        dest="feature",
                        default=None,
                        required=True,
                        help="The gene/feature to query from pseudomonas.com.")

    # Exactly one of --strain / --organism must be given.
    org_group = parser.add_mutually_exclusive_group(required=True)
    org_group.add_argument("-s",
                           "--strain",
                           dest="strain",
                           default=None,
                           help="The strain to query from pseudomonas.com. Mutually exclusive with parameter -O/--organism option.")

    org_group.add_argument("-O",
                           "--organism",
                           dest="organism",
                           default=None,
                           help="The organism to query from pseudomonas.com. Mutually exclusive with parameter 'strain'.")

    # Parse arguments.
    args = parser.parse_args()

    # NOTE(review): `_run_from_cli` is not defined in the visible part of this
    # module; unless it is provided elsewhere this call raises NameError.
    _run_from_cli(args)