Skip to content

Commit

Permalink
Merge pull request #73 from bcgsc/release/v1.8.0_constants_usablity
Browse files Browse the repository at this point in the history
Release/v1.8.0 constants usability
  • Loading branch information
dustinbleile authored Nov 2, 2022
2 parents 13a8d9d + ba11684 commit 101b616
Show file tree
Hide file tree
Showing 8 changed files with 429 additions and 370 deletions.
208 changes: 2 additions & 206 deletions graphkb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,206 +1,2 @@
import hashlib
import json
from datetime import datetime
from typing import Any, Dict, List, cast, Optional

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from .types import ParsedVariant, Record
from .util import logger

DEFAULT_URL = 'https://graphkb-api.bcgsc.ca/api'
DEFAULT_LIMIT = 1000

QUERY_CACHE: Dict[Any, Any] = {}


def join_url(base_url: str, *parts) -> str:
    """Build a full URL by appending path segments to a base URL.

    Exactly one '/' is placed between the base and each segment: a single
    trailing slash on the base is dropped, and a leading slash is added to
    any segment that lacks one.
    """
    if not parts:
        return base_url

    base = base_url[:-1] if base_url.endswith('/') else base_url
    tail = ''.join(p if p.startswith('/') else f'/{p}' for p in parts)
    return base + tail


def millis_interval(start: datetime, end: datetime) -> int:
    """Return the number of whole milliseconds from start to end.

    Both arguments are datetime instances; microseconds are truncated
    (floor division), not rounded.
    """
    delta = end - start
    whole_seconds = delta.days * 24 * 60 * 60 + delta.seconds
    return whole_seconds * 1000 + delta.microseconds // 1000


def cache_key(request_body) -> str:
    """Compute a deterministic cache key for a GraphKB /query request.

    The request body is serialized with sorted keys so that logically
    identical queries hash to the same value regardless of key order.
    """
    serialized = json.dumps(request_body, sort_keys=True)
    return hashlib.md5(f'/query{serialized}'.encode('utf-8')).hexdigest()


class GraphKBConnection:
    """Thin HTTP client for the GraphKB REST API.

    Wraps a requests Session (with automatic retries on transient HTTP
    failures), token-based authentication with transparent re-login,
    request timing metrics, and an optional in-memory cache of paginated
    query results.
    """

    def __init__(self, url: str = DEFAULT_URL, use_global_cache: bool = True):
        """
        Args:
            url: base URL of the GraphKB API
            use_global_cache: share the module-level QUERY_CACHE across all
                connection instances instead of using a per-instance cache
        """
        self.http = requests.Session()
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        self.http.mount("https://", HTTPAdapter(max_retries=retries))

        self.token = ''
        self.url = url
        self.username = ''
        self.password = ''
        self.headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
        self.cache: Dict[Any, Any] = QUERY_CACHE if use_global_cache else {}
        self.request_count = 0
        self.first_request: Optional[datetime] = None
        self.last_request: Optional[datetime] = None

    @property
    def load(self) -> Optional[float]:
        """Average requests per second since the first request.

        Returns None when no requests have been made yet, or when the
        first and last request share the same timestamp (zero interval).
        """
        if self.first_request and self.last_request:
            interval = millis_interval(self.first_request, self.last_request)
            # guard: after a single request the interval is 0 and the
            # original expression raised ZeroDivisionError
            if interval > 0:
                return self.request_count * 1000 / interval
        return None

    def request(self, endpoint: str, method: str = 'GET', **kwargs) -> Dict:
        """Request wrapper to handle adding common headers and logging.

        Args:
            endpoint (string): api endpoint, excluding the base uri
            method (str, optional): the http method. Defaults to 'GET'.

        Returns:
            dict: the json response as a python dict

        Raises:
            requests.exceptions.HTTPError: on a non-2xx response, with any
                server-provided error message appended
        """
        url = join_url(self.url, endpoint)
        self.request_count += 1
        start_time = datetime.now()
        if not self.first_request:
            self.first_request = start_time
        self.last_request = start_time
        # use the session (not bare requests.request) so the Retry adapter
        # mounted in __init__ actually applies to these calls
        resp = self.http.request(method, url, headers=self.headers, **kwargs)

        if resp.status_code in (401, 403):
            # the token may have expired; re-login and retry once
            self.refresh_login()
            self.request_count += 1
            resp = self.http.request(method, url, headers=self.headers, **kwargs)
        timing = millis_interval(start_time, datetime.now())
        logger.verbose(f'/{endpoint} - {resp.status_code} - {timing} ms')  # type: ignore

        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as err:
            # append any server-provided error detail to the message
            message = str(err)
            try:
                message += ' ' + resp.json()['message']
            except Exception:
                pass

            # keep the response attached and chain the original exception
            # so callers can still inspect status code and traceback
            raise requests.exceptions.HTTPError(message, response=resp) from err

        return resp.json()

    def post(self, uri: str, data: Optional[Dict] = None, **kwargs) -> Dict:
        """Convenience method for making post requests."""
        # avoid a mutable default argument; None means an empty body
        payload = {} if data is None else data
        return self.request(uri, method='POST', data=json.dumps(payload), **kwargs)

    def login(self, username: str, password: str) -> None:
        """Authenticate against the API and store the resulting token.

        The credentials are kept so that refresh_login() can re-authenticate
        when a token expires mid-session.
        """
        self.username = username
        self.password = password

        # use requests package directly to avoid recursion loop on login failure
        self.request_count += 1
        resp = requests.request(
            url=f'{self.url}/token',
            method='POST',
            headers=self.headers,
            data=json.dumps({'username': username, 'password': password}),
        )
        resp.raise_for_status()
        content = resp.json()
        self.token = content['kbToken']
        self.headers['Authorization'] = self.token

    def refresh_login(self) -> None:
        """Re-authenticate with the stored credentials."""
        self.login(self.username, self.password)

    def set_cache_data(self, request_body: Dict, result: List[Record]) -> None:
        """Explicitly add a query result to the cache."""
        hash_code = cache_key(request_body)
        self.cache[hash_code] = result

    def query(
        self,
        request_body: Optional[Dict] = None,
        paginate: bool = True,
        ignore_cache: bool = False,
        force_refresh: bool = False,
        limit: int = DEFAULT_LIMIT,
    ) -> List[Record]:
        """Query GraphKB, transparently paginating and caching results.

        Args:
            request_body: the GraphKB query payload
            paginate: fetch all pages, not just the first `limit` records
            ignore_cache: bypass the cache entirely (no read, no write)
            force_refresh: re-fetch even if a cached result exists
            limit: page size for each request

        Returns:
            the combined list of result records
        """
        # avoid a mutable default argument; None means an empty query body
        request_body = {} if request_body is None else request_body
        result: List[Record] = []
        hash_code = ""

        # only paginated (i.e. complete) results are safe to cache
        if not ignore_cache and paginate:
            hash_code = cache_key(request_body)
            if hash_code in self.cache and not force_refresh:
                return self.cache[hash_code]

        while True:
            content = self.post('query', data={**request_body, 'limit': limit, 'skip': len(result)})
            records = content['result']
            result.extend(records)
            # a short page means we have reached the end of the results
            if len(records) < limit or not paginate:
                break

        if not ignore_cache and paginate:
            self.cache[hash_code] = result
        return result

    def parse(self, hgvs_string: str, requireFeatures: bool = False) -> ParsedVariant:
        """Parse an HGVS-like variant notation via the API /parse endpoint."""
        content = self.post(
            'parse', data={'content': hgvs_string, 'requireFeatures': requireFeatures}
        )
        return cast(ParsedVariant, content['result'])

    def get_records_by_id(self, record_ids: List[str]) -> List[Record]:
        """Fetch records by their record IDs (@rid values).

        Raises:
            AssertionError: if the API returns a different number of records
                than was requested (e.g. a stale or invalid ID)
        """
        if not record_ids:
            return []
        result = self.query({'target': record_ids})
        if len(record_ids) != len(result):
            raise AssertionError(
                f'The number of Ids given ({len(record_ids)}) does not match the number of records fetched ({len(result)})'
            )
        return result

    def get_record_by_id(self, record_id: str) -> Record:
        """Fetch a single record by its record ID."""
        result = self.get_records_by_id([record_id])
        return result[0]

    def get_source(self, name: str) -> Record:
        """Fetch the Source record with the given name.

        Raises:
            AssertionError: if zero or multiple sources match the name
        """
        source = self.query({'target': 'Source', 'filters': {'name': name}})
        if len(source) != 1:
            raise AssertionError(f'Unable to uniquely identify source with name {name}')
        return source[0]
from .constants import DEFAULT_URL
from .util import GraphKBConnection, logger
87 changes: 83 additions & 4 deletions graphkb/constants.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import argparse

from .types import CategoryBaseTermMapping

BASE_RETURN_PROPERTIES = [
'@rid',
'@class',
]
DEFAULT_LIMIT = 1000

GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
DEFAULT_URL = GKB_BASE_URL


BASE_RETURN_PROPERTIES = ['@rid', '@class']

GENERIC_RETURN_PROPERTIES = [
'name',
Expand All @@ -15,6 +22,47 @@
'deprecated',
] + BASE_RETURN_PROPERTIES

GENE_RETURN_PROPERTIES = ['biotype'] + GENERIC_RETURN_PROPERTIES

VARIANT_RETURN_PROPERTIES = (
BASE_RETURN_PROPERTIES
+ [f'type.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'reference1.{p}' for p in GENE_RETURN_PROPERTIES]
+ [f'reference2.{p}' for p in GENE_RETURN_PROPERTIES]
+ ['zygosity', 'germline', 'displayName']
)

POS_VARIANT_RETURN_PROPERTIES = VARIANT_RETURN_PROPERTIES + [
'break1Start',
'break1End',
'break2Start',
'break2End',
'break1Repr',
'break2Repr',
'refSeq',
'untemplatedSeq',
'untemplatedSeqSize',
'truncation',
'assembly',
]

STATEMENT_RETURN_PROPERTIES = (
BASE_RETURN_PROPERTIES
+ ['displayNameTemplate', 'sourceId', 'source.name', 'source.displayName']
+ [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ ['reviewStatus']
)


ONCOKB_SOURCE_NAME = 'oncokb'
ONCOGENE = 'oncogenic'
TUMOUR_SUPPRESSIVE = 'tumour suppressive'

FUSION_NAMES = ['structural variant', 'fusion']
BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
# the order here is the order these are applied, the first category matched is returned
RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
Expand All @@ -26,6 +74,7 @@
('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
]

AMBIGUOUS_AA = ['x', '?', 'X']
AA_3to1_MAPPING = {
'Ala': 'A',
'Arg': 'R',
Expand All @@ -51,3 +100,33 @@
'Val': 'V',
'Ter': '*',
}


class IterableNamespace(argparse.Namespace):
    """An argparse.Namespace that also behaves like a read-only mapping.

    Adds keys()/items()/values() views and subscript access so that an
    instance can be iterated like a dict or unpacked with **.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def keys(self):
        """Attribute names, in insertion order."""
        return self.__dict__.keys()

    def items(self):
        """(name, value) pairs for every attribute."""
        return self.__dict__.items()

    def values(self):
        """Attribute values, in insertion order."""
        return self.__dict__.values()

    def __getitem__(self, key):
        """Allow ns[key] as an alias for ns.key."""
        return getattr(self, key)


# Controlled vocabulary of copy-number variant categories accepted as input.
# IterableNamespace allows both attribute access (INPUT_COPY_CATEGORIES.AMP)
# and dict-style access/iteration over the category names.
INPUT_COPY_CATEGORIES = IterableNamespace(
    AMP='amplification',
    ANY_GAIN='copy gain',
    ANY_LOSS='copy loss',
    DEEP='deep deletion',
    GAIN='low level copy gain',
    LOSS='shallow deletion',
)
# Controlled vocabulary of expression variant categories accepted as input.
INPUT_EXPRESSION_CATEGORIES = IterableNamespace(
    UP='increased expression', DOWN='reduced expression'
)
64 changes: 45 additions & 19 deletions graphkb/genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,15 @@
from typing import Any, Dict, List, cast

from . import GraphKBConnection
from .constants import (
BASE_THERAPEUTIC_TERMS,
GENE_RETURN_PROPERTIES,
ONCOGENE,
ONCOKB_SOURCE_NAME,
TUMOUR_SUPPRESSIVE,
)
from .types import Ontology, Statement, Variant

ONCOKB_SOURCE_NAME = 'oncokb'
ONCOGENE = 'oncogenic'
TUMOUR_SUPPRESSIVE = 'tumour suppressive'

FUSION_NAMES = ['structural variant', 'fusion']

GENE_RETURN_PROPERTIES = [
'name',
'@rid',
'@class',
'sourceId',
'sourceIdVersion',
'source.name',
'source.@rid',
'displayName',
'biotype',
'deprecated',
]
from .vocab import get_terms_set


def _get_oncokb_gene_list(
Expand Down Expand Up @@ -81,6 +70,43 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
return _get_oncokb_gene_list(conn, TUMOUR_SUPPRESSIVE)


def get_therapeutic_associated_genes(graphkb_conn: GraphKBConnection) -> List[Ontology]:
    """Genes related to a therapeutic statement in GraphKB.

    Args:
        graphkb_conn: the GraphKB API connection

    Returns:
        unique, non-deprecated Feature records that appear as a condition
        (or as reference1/reference2 of a variant condition) on any
        non-failed statement whose relevance is a therapeutic term
    """
    therapeutic_relevance = get_terms_set(graphkb_conn, BASE_THERAPEUTIC_TERMS)
    statements = graphkb_conn.query(
        {
            'target': 'Statement',
            'filters': {'relevance': sorted(list(therapeutic_relevance))},
            'returnProperties': ['reviewStatus']
            + [f'conditions.{prop}' for prop in GENE_RETURN_PROPERTIES]
            + [
                f'conditions.reference{ref}.{prop}'
                for prop in GENE_RETURN_PROPERTIES
                for ref in ('1', '2')
            ],
        }
    )
    genes: List[Ontology] = []
    for statement in statements:
        # skip statements that failed review
        if statement['reviewStatus'] == 'failed':
            continue
        for condition in statement['conditions']:
            if condition['@class'] == 'Feature':
                genes.append(condition)
            elif condition['@class'].endswith('Variant'):
                # variants point at their gene(s) via reference1/reference2
                cond = cast(Variant, condition)
                if cond['reference1'] and cond['reference1']['@class'] == 'Feature':
                    genes.append(cond['reference1'])
                if cond['reference2'] and cond['reference2']['@class'] == 'Feature':
                    genes.append(cond['reference2'])
    # de-duplicate by @rid with a seen-set (O(n)) instead of rescanning the
    # accumulated list for every gene (O(n^2)); first-seen order is preserved
    unique_genes: List[Ontology] = []
    seen_rids = set()
    for gene in genes:
        if gene.get('deprecated', False):
            continue
        if gene['@rid'] not in seen_rids:
            seen_rids.add(gene['@rid'])
            unique_genes.append(gene)
    return unique_genes


def get_genes_from_variant_types(
conn: GraphKBConnection,
types: List[str],
Expand Down
Loading

0 comments on commit 101b616

Please sign in to comment.