Skip to content

Commit

Permalink
Merge pull request #73 from bcgsc/release/v1.8.0_constants_usablity
Browse files Browse the repository at this point in the history
Release/v1.8.0 constants usability
  • Loading branch information
dustinbleile authored Nov 2, 2022
2 parents 13a8d9d + ba11684 commit 101b616
Show file tree
Hide file tree
Showing 8 changed files with 429 additions and 370 deletions.
208 changes: 2 additions & 206 deletions graphkb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,206 +1,2 @@
import hashlib
import json
from datetime import datetime
from typing import Any, Dict, List, cast, Optional

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from .types import ParsedVariant, Record
from .util import logger

DEFAULT_URL = 'https://graphkb-api.bcgsc.ca/api'
DEFAULT_LIMIT = 1000

QUERY_CACHE: Dict[Any, Any] = {}


def join_url(base_url: str, *parts) -> str:
    """Build a full URL by appending path segments to a base URL.

    Exactly one '/' is placed between the base and each segment: a single
    trailing slash on the base is dropped, and a leading slash is added to
    any segment that lacks one.
    """
    if not parts:
        return base_url

    base = base_url[:-1] if base_url.endswith('/') else base_url
    tail = ''.join(p if p.startswith('/') else f'/{p}' for p in parts)
    return base + tail


def millis_interval(start: datetime, end: datetime) -> int:
    """Return the number of whole milliseconds from start to end.

    Both arguments are datetime instances; microseconds are truncated
    (floor division), not rounded.
    """
    delta = end - start
    whole_seconds = delta.days * 24 * 60 * 60 + delta.seconds
    return whole_seconds * 1000 + delta.microseconds // 1000


def cache_key(request_body) -> str:
    """Compute a deterministic cache key for a GraphKB /query request.

    The request body is serialized with sorted keys so that logically
    identical queries hash to the same value regardless of key order.
    """
    serialized = json.dumps(request_body, sort_keys=True)
    return hashlib.md5(f'/query{serialized}'.encode('utf-8')).hexdigest()


class GraphKBConnection:
    """Thin HTTP client for the GraphKB REST API.

    Wraps a requests Session (with automatic retries on transient HTTP
    failures), token-based authentication with transparent re-login,
    request timing metrics, and an optional in-memory cache of paginated
    query results.
    """

    def __init__(self, url: str = DEFAULT_URL, use_global_cache: bool = True):
        """
        Args:
            url: base URL of the GraphKB API
            use_global_cache: share the module-level QUERY_CACHE across all
                connection instances instead of using a per-instance cache
        """
        self.http = requests.Session()
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        self.http.mount("https://", HTTPAdapter(max_retries=retries))

        self.token = ''
        self.url = url
        self.username = ''
        self.password = ''
        self.headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
        self.cache: Dict[Any, Any] = QUERY_CACHE if use_global_cache else {}
        self.request_count = 0
        self.first_request: Optional[datetime] = None
        self.last_request: Optional[datetime] = None

    @property
    def load(self) -> Optional[float]:
        """Average requests per second since the first request.

        Returns None when no requests have been made yet, or when the
        first and last request share the same timestamp (zero interval).
        """
        if self.first_request and self.last_request:
            interval = millis_interval(self.first_request, self.last_request)
            # guard: after a single request the interval is 0 and the
            # original expression raised ZeroDivisionError
            if interval > 0:
                return self.request_count * 1000 / interval
        return None

    def request(self, endpoint: str, method: str = 'GET', **kwargs) -> Dict:
        """Request wrapper to handle adding common headers and logging.

        Args:
            endpoint (string): api endpoint, excluding the base uri
            method (str, optional): the http method. Defaults to 'GET'.

        Returns:
            dict: the json response as a python dict

        Raises:
            requests.exceptions.HTTPError: on a non-2xx response, with any
                server-provided error message appended
        """
        url = join_url(self.url, endpoint)
        self.request_count += 1
        start_time = datetime.now()
        if not self.first_request:
            self.first_request = start_time
        self.last_request = start_time
        # use the session (not bare requests.request) so the Retry adapter
        # mounted in __init__ actually applies to these calls
        resp = self.http.request(method, url, headers=self.headers, **kwargs)

        if resp.status_code in (401, 403):
            # the token may have expired; re-login and retry once
            self.refresh_login()
            self.request_count += 1
            resp = self.http.request(method, url, headers=self.headers, **kwargs)
        timing = millis_interval(start_time, datetime.now())
        logger.verbose(f'/{endpoint} - {resp.status_code} - {timing} ms')  # type: ignore

        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as err:
            # append any server-provided error detail to the message
            message = str(err)
            try:
                message += ' ' + resp.json()['message']
            except Exception:
                pass

            # keep the response attached and chain the original exception
            # so callers can still inspect status code and traceback
            raise requests.exceptions.HTTPError(message, response=resp) from err

        return resp.json()

    def post(self, uri: str, data: Optional[Dict] = None, **kwargs) -> Dict:
        """Convenience method for making post requests."""
        # avoid a mutable default argument; None means an empty body
        payload = {} if data is None else data
        return self.request(uri, method='POST', data=json.dumps(payload), **kwargs)

    def login(self, username: str, password: str) -> None:
        """Authenticate against the API and store the resulting token.

        The credentials are kept so that refresh_login() can re-authenticate
        when a token expires mid-session.
        """
        self.username = username
        self.password = password

        # use requests package directly to avoid recursion loop on login failure
        self.request_count += 1
        resp = requests.request(
            url=f'{self.url}/token',
            method='POST',
            headers=self.headers,
            data=json.dumps({'username': username, 'password': password}),
        )
        resp.raise_for_status()
        content = resp.json()
        self.token = content['kbToken']
        self.headers['Authorization'] = self.token

    def refresh_login(self) -> None:
        """Re-authenticate with the stored credentials."""
        self.login(self.username, self.password)

    def set_cache_data(self, request_body: Dict, result: List[Record]) -> None:
        """Explicitly add a query result to the cache."""
        hash_code = cache_key(request_body)
        self.cache[hash_code] = result

    def query(
        self,
        request_body: Optional[Dict] = None,
        paginate: bool = True,
        ignore_cache: bool = False,
        force_refresh: bool = False,
        limit: int = DEFAULT_LIMIT,
    ) -> List[Record]:
        """Query GraphKB, transparently paginating and caching results.

        Args:
            request_body: the GraphKB query payload
            paginate: fetch all pages, not just the first `limit` records
            ignore_cache: bypass the cache entirely (no read, no write)
            force_refresh: re-fetch even if a cached result exists
            limit: page size for each request

        Returns:
            the combined list of result records
        """
        # avoid a mutable default argument; None means an empty query body
        request_body = {} if request_body is None else request_body
        result: List[Record] = []
        hash_code = ""

        # only paginated (i.e. complete) results are safe to cache
        if not ignore_cache and paginate:
            hash_code = cache_key(request_body)
            if hash_code in self.cache and not force_refresh:
                return self.cache[hash_code]

        while True:
            content = self.post('query', data={**request_body, 'limit': limit, 'skip': len(result)})
            records = content['result']
            result.extend(records)
            # a short page means we have reached the end of the results
            if len(records) < limit or not paginate:
                break

        if not ignore_cache and paginate:
            self.cache[hash_code] = result
        return result

    def parse(self, hgvs_string: str, requireFeatures: bool = False) -> ParsedVariant:
        """Parse an HGVS-like variant notation via the API /parse endpoint."""
        content = self.post(
            'parse', data={'content': hgvs_string, 'requireFeatures': requireFeatures}
        )
        return cast(ParsedVariant, content['result'])

    def get_records_by_id(self, record_ids: List[str]) -> List[Record]:
        """Fetch records by their record IDs (@rid values).

        Raises:
            AssertionError: if the API returns a different number of records
                than was requested (e.g. a stale or invalid ID)
        """
        if not record_ids:
            return []
        result = self.query({'target': record_ids})
        if len(record_ids) != len(result):
            raise AssertionError(
                f'The number of Ids given ({len(record_ids)}) does not match the number of records fetched ({len(result)})'
            )
        return result

    def get_record_by_id(self, record_id: str) -> Record:
        """Fetch a single record by its record ID."""
        result = self.get_records_by_id([record_id])
        return result[0]

    def get_source(self, name: str) -> Record:
        """Fetch the Source record with the given name.

        Raises:
            AssertionError: if zero or multiple sources match the name
        """
        source = self.query({'target': 'Source', 'filters': {'name': name}})
        if len(source) != 1:
            raise AssertionError(f'Unable to uniquely identify source with name {name}')
        return source[0]
from .constants import DEFAULT_URL
from .util import GraphKBConnection, logger
87 changes: 83 additions & 4 deletions graphkb/constants.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import argparse

from .types import CategoryBaseTermMapping

BASE_RETURN_PROPERTIES = [
'@rid',
'@class',
]
DEFAULT_LIMIT = 1000

GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
DEFAULT_URL = GKB_BASE_URL


BASE_RETURN_PROPERTIES = ['@rid', '@class']

GENERIC_RETURN_PROPERTIES = [
'name',
Expand All @@ -15,6 +22,47 @@
'deprecated',
] + BASE_RETURN_PROPERTIES

GENE_RETURN_PROPERTIES = ['biotype'] + GENERIC_RETURN_PROPERTIES

VARIANT_RETURN_PROPERTIES = (
BASE_RETURN_PROPERTIES
+ [f'type.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'reference1.{p}' for p in GENE_RETURN_PROPERTIES]
+ [f'reference2.{p}' for p in GENE_RETURN_PROPERTIES]
+ ['zygosity', 'germline', 'displayName']
)

POS_VARIANT_RETURN_PROPERTIES = VARIANT_RETURN_PROPERTIES + [
'break1Start',
'break1End',
'break2Start',
'break2End',
'break1Repr',
'break2Repr',
'refSeq',
'untemplatedSeq',
'untemplatedSeqSize',
'truncation',
'assembly',
]

STATEMENT_RETURN_PROPERTIES = (
BASE_RETURN_PROPERTIES
+ ['displayNameTemplate', 'sourceId', 'source.name', 'source.displayName']
+ [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]
+ ['reviewStatus']
)


ONCOKB_SOURCE_NAME = 'oncokb'
ONCOGENE = 'oncogenic'
TUMOUR_SUPPRESSIVE = 'tumour suppressive'

FUSION_NAMES = ['structural variant', 'fusion']
BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
# the order here is the order these are applied, the first category matched is returned
RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
Expand All @@ -26,6 +74,7 @@
('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
]

AMBIGUOUS_AA = ['x', '?', 'X']
AA_3to1_MAPPING = {
'Ala': 'A',
'Arg': 'R',
Expand All @@ -51,3 +100,33 @@
'Val': 'V',
'Ter': '*',
}


class IterableNamespace(argparse.Namespace):
    """An argparse.Namespace that also behaves like a read-only mapping.

    Adds keys()/items()/values() views and subscript access so that an
    instance can be iterated like a dict or unpacked with **.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def keys(self):
        """Attribute names, in insertion order."""
        return self.__dict__.keys()

    def items(self):
        """(name, value) pairs for every attribute."""
        return self.__dict__.items()

    def values(self):
        """Attribute values, in insertion order."""
        return self.__dict__.values()

    def __getitem__(self, key):
        """Allow ns[key] as an alias for ns.key."""
        return getattr(self, key)


# Controlled vocabulary of copy-number variant categories accepted as input.
# IterableNamespace allows both attribute access (INPUT_COPY_CATEGORIES.AMP)
# and dict-style access/iteration over the category names.
INPUT_COPY_CATEGORIES = IterableNamespace(
    AMP='amplification',
    ANY_GAIN='copy gain',
    ANY_LOSS='copy loss',
    DEEP='deep deletion',
    GAIN='low level copy gain',
    LOSS='shallow deletion',
)
# Controlled vocabulary of expression variant categories accepted as input.
INPUT_EXPRESSION_CATEGORIES = IterableNamespace(
    UP='increased expression', DOWN='reduced expression'
)
64 changes: 45 additions & 19 deletions graphkb/genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,15 @@
from typing import Any, Dict, List, cast

from . import GraphKBConnection
from .constants import (
BASE_THERAPEUTIC_TERMS,
GENE_RETURN_PROPERTIES,
ONCOGENE,
ONCOKB_SOURCE_NAME,
TUMOUR_SUPPRESSIVE,
)
from .types import Ontology, Statement, Variant

ONCOKB_SOURCE_NAME = 'oncokb'
ONCOGENE = 'oncogenic'
TUMOUR_SUPPRESSIVE = 'tumour suppressive'

FUSION_NAMES = ['structural variant', 'fusion']

GENE_RETURN_PROPERTIES = [
'name',
'@rid',
'@class',
'sourceId',
'sourceIdVersion',
'source.name',
'source.@rid',
'displayName',
'biotype',
'deprecated',
]
from .vocab import get_terms_set


def _get_oncokb_gene_list(
Expand Down Expand Up @@ -81,6 +70,43 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
return _get_oncokb_gene_list(conn, TUMOUR_SUPPRESSIVE)


def get_therapeutic_associated_genes(graphkb_conn: GraphKBConnection) -> List[Ontology]:
    """Genes related to a therapeutic statement in GraphKB.

    Args:
        graphkb_conn: the GraphKB API connection

    Returns:
        unique, non-deprecated Feature records that appear as a condition
        (or as reference1/reference2 of a variant condition) on any
        non-failed statement whose relevance is a therapeutic term
    """
    therapeutic_relevance = get_terms_set(graphkb_conn, BASE_THERAPEUTIC_TERMS)
    statements = graphkb_conn.query(
        {
            'target': 'Statement',
            'filters': {'relevance': sorted(list(therapeutic_relevance))},
            'returnProperties': ['reviewStatus']
            + [f'conditions.{prop}' for prop in GENE_RETURN_PROPERTIES]
            + [
                f'conditions.reference{ref}.{prop}'
                for prop in GENE_RETURN_PROPERTIES
                for ref in ('1', '2')
            ],
        }
    )
    genes: List[Ontology] = []
    for statement in statements:
        # skip statements that failed review
        if statement['reviewStatus'] == 'failed':
            continue
        for condition in statement['conditions']:
            if condition['@class'] == 'Feature':
                genes.append(condition)
            elif condition['@class'].endswith('Variant'):
                # variants point at their gene(s) via reference1/reference2
                cond = cast(Variant, condition)
                if cond['reference1'] and cond['reference1']['@class'] == 'Feature':
                    genes.append(cond['reference1'])
                if cond['reference2'] and cond['reference2']['@class'] == 'Feature':
                    genes.append(cond['reference2'])
    # de-duplicate by @rid with a seen-set (O(n)) instead of rescanning the
    # accumulated list for every gene (O(n^2)); first-seen order is preserved
    unique_genes: List[Ontology] = []
    seen_rids = set()
    for gene in genes:
        if gene.get('deprecated', False):
            continue
        if gene['@rid'] not in seen_rids:
            seen_rids.add(gene['@rid'])
            unique_genes.append(gene)
    return unique_genes


def get_genes_from_variant_types(
conn: GraphKBConnection,
types: List[str],
Expand Down
Loading

0 comments on commit 101b616

Please sign in to comment.