From 2ec82805d1587d2a4663edb9e0a15eab7a5bcf83 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Wed, 19 Oct 2022 14:18:42 -0700
Subject: [PATCH 01/32] SDEV-3256 - bugfix - get_equivalent_features - ignore
 versions in gene_names or transcripts for equivalent features.  Eg.
 NM_033360.4 should match 'NM_033360' and 'KRAS'.

---
 graphkb/match.py    | 11 ++++++++---
 tests/test_match.py | 16 ++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/graphkb/match.py b/graphkb/match.py
index 8312e72..6e41bb3 100644
--- a/graphkb/match.py
+++ b/graphkb/match.py
@@ -12,7 +12,7 @@
     VARIANT_RETURN_PROPERTIES,
 )
 from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant
-from .util import FeatureNotFoundError, convert_to_rid_list, looks_like_rid
+from .util import FeatureNotFoundError, convert_to_rid_list, logger, looks_like_rid
 from .vocab import get_term_tree
 
 FEATURES_CACHE: Set[str] = set()
@@ -62,14 +62,19 @@ def get_equivalent_features(
     if source:
         filters.append({'source': {'target': 'Source', 'filters': {'name': source}}})
 
+    if gene_name.count('.') == 1 and gene_name.split('.')[-1].isnumeric():
+        # eg. ENSG00000133703.11 or NM_033360.4
+        logger.debug(
+            f"Assuming {gene_name} has a .version_format - ignoring the version for equivalent features"
+        )
+        gene_name = gene_name.split('.')[0]
+
     if is_source_id or source_id_version:
         filters.append({'sourceId': gene_name})
-
         if source_id_version:
             filters.append(
                 {'OR': [{'sourceIdVersion': source_id_version}, {'sourceIdVersion': None}]}
             )
-
     elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache:
         return []
     else:
diff --git a/tests/test_match.py b/tests/test_match.py
index 6b9a929..5b969b0 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -47,6 +47,22 @@ def test_expands_generalizations(self, kras):
         assert 'NM_033360.4' in kras
         assert 'ENSG00000133703.11' in kras
 
+    def test_expands_generalizations_kras(self, kras):
+        assert 'NM_033360.4' in kras
+        assert 'NM_033360' in kras
+        assert 'ENSG00000133703.11' in kras
+        assert 'ENSG00000133703' in kras
+
+    @pytest.mark.parametrize(
+        'alt_rep', ('NM_033360.4', 'NM_033360', 'ENSG00000133703.11', 'ENSG00000133703')
+    )
+    def test_expands_generalizations_refseq(self, alt_rep, conn):
+        kras = [f['displayName'] for f in match.get_equivalent_features(conn, alt_rep)]
+        assert 'NM_033360.4' in kras
+        assert 'NM_033360' in kras
+        assert 'ENSG00000133703.11' in kras
+        assert 'ENSG00000133703' in kras
+
 
 class TestMatchCopyVariant:
     def test_bad_category(self, conn):

From 294799683ee3ea24755891cdfe5e1fa51a47a243 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Wed, 19 Oct 2022 14:57:43 -0700
Subject: [PATCH 02/32] SDEV-3256 - new feature get_preferred_gene_name

---
 graphkb/match.py    | 37 +++++++++++++++++++++++++++++++++++--
 tests/test_match.py |  9 +++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/graphkb/match.py b/graphkb/match.py
index 6e41bb3..b542b0d 100644
--- a/graphkb/match.py
+++ b/graphkb/match.py
@@ -26,8 +26,7 @@ def get_equivalent_features(
     source: str = '',
     source_id_version: str = '',
 ) -> List[Ontology]:
-    """
-    Match an equivalent list of features given some input feature name (or ID)
+    """Match an equivalent list of features given some input feature name (or ID).
 
     Args:
         gene_name: the gene name to search features by
@@ -89,6 +88,40 @@ def get_equivalent_features(
     )
 
 
+def get_preferred_gene_name(conn: GraphKBConnection, gene_name: str, source: str = '#39:5') -> str:
+    """Preferred gene symbol of a gene or transcript.
+
+    Args:
+        gene_name: the gene name to search features by
+        ignore_cache (bool, optional): bypass the cache to always force a new request
+        source: id of the preferred gene symbol source
+    Returns:
+        preferred displayName symbol.
+
+    Example:
+        return KRAS for get_preferred_gene_name(conn, 'NM_033360')
+        return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
+    """
+    eq = get_equivalent_features(conn=conn, gene_name=gene_name)
+    genes = [m for m in eq if m.get('biotype', '') == 'gene' and not m.get('deprecated', False)]
+    if not genes:
+        logger.error(f"No genes found for: {gene_name}")
+        return ''
+    if source:
+        source_filtered_genes = [m for m in genes if m.get('source', '') == source]
+        if not source_filtered_genes:
+            logger.error(f"No data from source {source} for {gene_name}")
+        else:
+            genes = source_filtered_genes
+
+    gene_names = [g['displayName'] for g in genes if g]
+    if len(gene_names) > 1:
+        logger.error(
+            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
+        )
+    return gene_names[0]
+
+
 def cache_missing_features(conn: GraphKBConnection) -> None:
     """
     Create a cache of features that exist to avoid repeatedly querying
diff --git a/tests/test_match.py b/tests/test_match.py
index 5b969b0..4cd2ff0 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -63,6 +63,15 @@ def test_expands_generalizations_refseq(self, alt_rep, conn):
         assert 'ENSG00000133703.11' in kras
         assert 'ENSG00000133703' in kras
 
+    @pytest.mark.parametrize(
+        'alt_rep', ('NM_033360.4', 'NM_033360', 'ENSG00000133703.11', 'ENSG00000133703')
+    )
+    def test_get_preferred_gene_name(self, alt_rep, conn):
+        gene_name = match.get_preferred_gene_name(conn, alt_rep)
+        assert (
+            'KRAS' == gene_name
+        ), f"Expected KRAS as preferred gene name for {alt_rep}, not '{gene_name}'"
+
 
 class TestMatchCopyVariant:
     def test_bad_category(self, conn):

From efd3cc4d0497698e5e4d7a74aba54c87249a242a Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Wed, 19 Oct 2022 15:45:43 -0700
Subject: [PATCH 03/32] SDEV-3256 - add get_cancer_predisposition_info and
 get_pharmacogenomic_info to genes.py

---
 graphkb/constants.py |   4 +-
 graphkb/genes.py     | 221 ++++++++++++++++++++++++++++++++++++++++++-
 graphkb/match.py     |  34 -------
 graphkb/util.py      |  24 +++++
 tests/test_genes.py  |  99 +++++++++++++++++++
 tests/test_match.py  |   9 --
 6 files changed, 345 insertions(+), 46 deletions(-)

diff --git a/graphkb/constants.py b/graphkb/constants.py
index ffe89c5..ffa57e7 100644
--- a/graphkb/constants.py
+++ b/graphkb/constants.py
@@ -56,8 +56,10 @@
 ONCOKB_SOURCE_NAME = 'oncokb'
 ONCOGENE = 'oncogenic'
 TUMOUR_SUPPRESSIVE = 'tumour suppressive'
-
 FUSION_NAMES = ['structural variant', 'fusion']
+PHARMACOGENOMIC_RELEVANCE_TERMS = ["decreased toxicity", "increased toxicity"]
+PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]
+
 BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
 # the order here is the order these are applied, the first category matched is returned
 RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
diff --git a/graphkb/genes.py b/graphkb/genes.py
index 519dfbe..eb4f029 100644
--- a/graphkb/genes.py
+++ b/graphkb/genes.py
@@ -1,11 +1,21 @@
 """
 Methods for retrieving gene annotation lists from GraphKB
 """
-from typing import Any, Dict, List, cast
+from typing import Any, Dict, List, Tuple, cast
 
 from . import GraphKBConnection
-from .constants import GENE_RETURN_PROPERTIES, ONCOGENE, ONCOKB_SOURCE_NAME, TUMOUR_SUPPRESSIVE
+from .constants import (
+    GENE_RETURN_PROPERTIES,
+    ONCOGENE,
+    ONCOKB_SOURCE_NAME,
+    PHARMACOGENOMIC_RELEVANCE_TERMS,
+    PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
+    RELEVANCE_BASE_TERMS,
+    TUMOUR_SUPPRESSIVE,
+)
+from .match import get_equivalent_features
 from .types import Ontology, Statement, Variant
+from .util import get_rid, logger
 
 
 def _get_oncokb_gene_list(
@@ -117,3 +127,210 @@ def get_genes_from_variant_types(
         ),
     )
     return result
+
+
+def get_term_list(target_category) -> List[str]:
+    """Load the relevance terms for 'cancer predisposition' variants."""
+    for category, base_terms in RELEVANCE_BASE_TERMS:
+        if category == target_category and base_terms:
+            return base_terms
+    else:
+        raise AssertionError(f"Undefined '{target_category}' category")
+
+
+def get_preferred_gene_name(conn: GraphKBConnection, gene_name: str, source: str = '#39:5') -> str:
+    """Preferred gene symbol of a gene or transcript.
+
+    Args:
+        gene_name: the gene name to search features by
+        ignore_cache (bool, optional): bypass the cache to always force a new request
+        source: id of the preferred gene symbol source
+    Returns:
+        preferred displayName symbol.
+
+    Example:
+        return KRAS for get_preferred_gene_name(conn, 'NM_033360')
+        return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
+    """
+    eq = get_equivalent_features(conn=conn, gene_name=gene_name)
+    genes = [m for m in eq if m.get('biotype', '') == 'gene' and not m.get('deprecated', False)]
+    if not genes:
+        logger.error(f"No genes found for: {gene_name}")
+        return ''
+    if source:
+        source_filtered_genes = [m for m in genes if m.get('source', '') == source]
+        if not source_filtered_genes:
+            logger.error(f"No data from source {source} for {gene_name}")
+        else:
+            genes = source_filtered_genes
+
+    gene_names = [g['displayName'] for g in genes if g]
+    if len(gene_names) > 1:
+        logger.error(
+            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
+        )
+    return gene_names[0]
+
+
+def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
+    """
+    Return two lists from GraphKB, one of cancer predisposition genes and one of associated variants.
+
+    GERO-272 - criteria for what counts as a "cancer predisposition" variant
+
+    In short:
+    * Statement 'source' is 'CGL'
+    * Statement 'relevance' is 'pathogenic'
+    * gene is gotten from any associated 'PositionalVariant' records
+
+    Example: https://graphkb.bcgsc.ca/view/Statement/155:11616
+
+    Returns:
+        genes: list of cancer predisposition genes
+        variants: dictionary mapping pharmacogenomic variant IDs to variant display names
+    """
+    genes = set()
+    non_genes = set()
+    infer_genes = set()
+    variants = {}
+
+    relevance = get_term_list("cancer predisposition")
+
+    for record in conn.query(
+        {
+            "target": "Statement",
+            "filters": [
+                {
+                    "evidence": {
+                        "target": "Source",
+                        "filters": {"@rid": get_rid(conn, "Source", "CGL")},
+                    },
+                    "relevance": {
+                        "target": "Vocabulary",
+                        "filters": {
+                            "@rid": [get_rid(conn, "Vocabulary", term) for term in relevance]
+                        },
+                    },
+                }
+            ],
+            "returnProperties": [
+                "conditions.@class",
+                "conditions.@rid",
+                "conditions.displayName",
+                "conditions.reference1.biotype",
+                "conditions.reference1.displayName",
+                "conditions.reference2.biotype",
+                "conditions.reference2.displayName",
+            ],
+        },
+        ignore_cache=False,
+    ):
+        for condition in record["conditions"]:  # type: ignore
+            if condition["@class"] == "PositionalVariant":
+                variants[condition["@rid"]] = condition["displayName"]
+                for reference in ["reference1", "reference2"]:
+                    name = (condition.get(reference) or {}).get("displayName", "")
+                    biotype = (condition.get(reference) or {}).get("biotype", "")
+                    if name and biotype == "gene":
+                        genes.add(name)
+                    elif name:
+                        gene = get_preferred_gene_name(conn, name)
+                        if gene:
+                            infer_genes.add((gene, name, biotype))
+                        else:
+                            non_genes.add((name, biotype))
+                            logger.error(
+                                f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
+                            )
+
+    for gene, name, biotype in infer_genes:
+        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
+        genes.add(gene)
+
+    for name, biotype in non_genes:
+        logger.error(f"Unable to find gene for '{name}' ({biotype})")
+
+    return sorted(genes), variants
+
+
+def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
+    """
+    Return two lists from GraphKB, one of pharmacogenomic genes and one of associated variants.
+
+    SDEV-2733 - criteria for what counts as a "pharmacogenomic" variant
+
+    In short:
+    * Statement 'source' is not 'CGI' or 'CIViC'
+    * Statement 'relevance' is 'increased toxicity' or 'decreased toxicity'
+    * gene is gotten from any associated 'PositionalVariant' records
+
+    Example: https://graphkb.bcgsc.ca/view/Statement/154:9574
+
+    Returns:
+        genes: list of pharmacogenomic genes
+        variants: dictionary mapping pharmacogenomic variant IDs to variant display names
+    """
+    genes = set()
+    non_genes = set()
+    infer_genes = set()
+    variants = {}
+
+    for record in conn.query(
+        {
+            "target": "Statement",
+            "filters": [
+                {
+                    "relevance": {
+                        "target": "Vocabulary",
+                        "filters": {
+                            "@rid": [
+                                get_rid(conn, "Vocabulary", term)
+                                for term in PHARMACOGENOMIC_RELEVANCE_TERMS
+                            ]
+                        },
+                    }
+                }
+            ],
+            "returnProperties": [
+                "conditions.@class",
+                "conditions.@rid",
+                "conditions.displayName",
+                "conditions.reference1.biotype",
+                "conditions.reference1.displayName",
+                "conditions.reference2.biotype",
+                "conditions.reference2.displayName",
+                "source.name",
+            ],
+        },
+        ignore_cache=False,
+    ):
+        if record["source"]:  # type: ignore
+            if record["source"]["name"].lower() in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:  # type: ignore
+                continue
+
+        for condition in record["conditions"]:  # type: ignore
+            if condition["@class"] == "PositionalVariant":
+                variants[condition["@rid"]] = condition["displayName"]
+                for reference in ["reference1", "reference2"]:
+                    name = (condition.get(reference) or {}).get("displayName", "")
+                    biotype = (condition.get(reference) or {}).get("biotype", "")
+                    if name and biotype == "gene":
+                        genes.add(name)
+                    elif name:
+                        gene = get_preferred_gene_name(conn, name)
+                        if gene:
+                            infer_genes.add((gene, name, biotype))
+                        else:
+                            non_genes.add((name, biotype))
+                            logger.error(
+                                f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
+                            )
+
+    for gene, name, biotype in infer_genes:
+        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
+        genes.add(gene)
+
+    for name, biotype in non_genes:
+        logger.error(f"Unable to find gene for '{name}' ({biotype})")
+
+    return sorted(genes), variants
diff --git a/graphkb/match.py b/graphkb/match.py
index b542b0d..7b81924 100644
--- a/graphkb/match.py
+++ b/graphkb/match.py
@@ -88,40 +88,6 @@ def get_equivalent_features(
     )
 
 
-def get_preferred_gene_name(conn: GraphKBConnection, gene_name: str, source: str = '#39:5') -> str:
-    """Preferred gene symbol of a gene or transcript.
-
-    Args:
-        gene_name: the gene name to search features by
-        ignore_cache (bool, optional): bypass the cache to always force a new request
-        source: id of the preferred gene symbol source
-    Returns:
-        preferred displayName symbol.
-
-    Example:
-        return KRAS for get_preferred_gene_name(conn, 'NM_033360')
-        return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
-    """
-    eq = get_equivalent_features(conn=conn, gene_name=gene_name)
-    genes = [m for m in eq if m.get('biotype', '') == 'gene' and not m.get('deprecated', False)]
-    if not genes:
-        logger.error(f"No genes found for: {gene_name}")
-        return ''
-    if source:
-        source_filtered_genes = [m for m in genes if m.get('source', '') == source]
-        if not source_filtered_genes:
-            logger.error(f"No data from source {source} for {gene_name}")
-        else:
-            genes = source_filtered_genes
-
-    gene_names = [g['displayName'] for g in genes if g]
-    if len(gene_names) > 1:
-        logger.error(
-            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
-        )
-    return gene_names[0]
-
-
 def cache_missing_features(conn: GraphKBConnection) -> None:
     """
     Create a cache of features that exist to avoid repeatedly querying
diff --git a/graphkb/util.py b/graphkb/util.py
index 9975a93..cce6862 100644
--- a/graphkb/util.py
+++ b/graphkb/util.py
@@ -248,3 +248,27 @@ def get_source(self, name: str) -> Record:
         if len(source) != 1:
             raise AssertionError(f'Unable to unqiuely identify source with name {name}')
         return source[0]
+
+
+def get_rid(conn: GraphKBConnection, target: str, name: str) -> str:
+    """
+    Retrieve a record by name and target
+
+    Args:
+        conn: GraphKBConnection
+        target: record type to query
+        name: the name of the record to retrieve
+
+    Returns:
+        str: @rid of the record
+
+    Raises:
+        AssertionError: if the term was not found or more than 1 match was found (expected to be unique)
+    """
+    result = conn.query(
+        {"target": target, "filters": {"name": name}, "returnProperties": ["@rid"]},
+        ignore_cache=False,
+    )
+    assert len(result) == 1, f"unable to find unique '{target}' ID for '{name}'"
+
+    return result[0]["@rid"]
diff --git a/tests/test_genes.py b/tests/test_genes.py
index 8f66020..59e24c2 100644
--- a/tests/test_genes.py
+++ b/tests/test_genes.py
@@ -8,14 +8,86 @@
 from graphkb import GraphKBConnection
 from graphkb.constants import FUSION_NAMES
 from graphkb.genes import (
+    get_cancer_predisposition_info,
     get_genes_from_variant_types,
     get_oncokb_oncogenes,
     get_oncokb_tumour_supressors,
+    get_pharmacogenomic_info,
+    get_preferred_gene_name,
 )
 
 CANONICAL_ONCOGENES = ['kras', 'nras', 'alk']
 CANONICAL_TS = ['cdkn2a', 'tp53']
 CANONICAL_FUSION_GENES = ['alk', 'ewsr1', 'fli1']
+PHARMACOGENOMIC_INITIAL_GENES = [
+    'ACYP2',
+    'CEP72',
+    # 'CYP26B1',  # defined as hgvsGenomic chr2:g.233760235_233760235nc_000002.12:g.233760235ta[7]>ta[8]
+    'DPYD',
+    'NUDT15',
+    'RARG',
+    'SLC28A3',
+    'TPMT',
+    'UGT1A6',
+]
+CANCER_PREDISP_INITIAL_GENES = [
+    'AKT1',
+    'APC',
+    'ATM',
+    'AXIN2',
+    'BAP1',
+    'BLM',
+    'BMPR1A',
+    'BRCA1',
+    'BRCA2',
+    'BRIP1',
+    'CBL',
+    'CDH1',
+    'CDK4',
+    'CDKN2A',
+    'CHEK2',
+    'DICER1',
+    'EGFR',
+    'EPCAM',
+    'ETV6',
+    'EZH2',
+    'FH',
+    'FLCN',
+    'GATA2',
+    'HRAS',
+    'KIT',
+    'MEN1',
+    'MET',
+    'MLH1',
+    'MSH2',
+    'MSH6',
+    'MUTYH',
+    'NBN',
+    'NF1',
+    'PALB2',
+    'PDGFRA',
+    'PMS2',
+    'PTCH1',
+    'PTEN',
+    'PTPN11',
+    'RAD51C',
+    'RAD51D',
+    'RB1',
+    'RET',
+    'RUNX1',
+    'SDHA',
+    'SDHB',
+    'SDHC',
+    'SDHD',
+    'SMAD4',
+    'SMARCA4',
+    'STK11',
+    'TP53',
+    'TSC1',
+    'TSC2',
+    'VHL',
+    'WT1',
+]
 
 
 @pytest.fixture(scope='module')
@@ -57,6 +129,33 @@ def test_finds_ts(conn, gene):
     assert gene in names
 
 
+def test_get_pharmacogenomic_info(conn):
+    genes, matches = get_pharmacogenomic_info(conn)
+    for gene in PHARMACOGENOMIC_INITIAL_GENES:
+        assert gene in genes, f"{gene} not found in get_pharmacogenomic_info"
+        for rid, variant_display in matches.items():
+            if variant_display.startswith(gene):
+                break
+        else:  # no break called
+            assert False, f"No rid found for a pharmacogenomic with {gene}"
+
+
+def test_get_cancer_predisposition_info(conn):
+    genes, matches = get_cancer_predisposition_info(conn)
+    for gene in CANCER_PREDISP_INITIAL_GENES:
+        assert gene in genes, f"{gene} not found in get_cancer_predisposition_info"
+
+
+@pytest.mark.parametrize(
+    'alt_rep', ('NM_033360.4', 'NM_033360', 'ENSG00000133703.11', 'ENSG00000133703')
+)
+def test_get_preferred_gene_name_kras(alt_rep, conn):
+    gene_name = get_preferred_gene_name(conn, alt_rep)
+    assert (
+        'KRAS' == gene_name
+    ), f"Expected KRAS as preferred gene name for {alt_rep}, not '{gene_name}'"
+
+
 @pytest.mark.parametrize('gene', CANONICAL_FUSION_GENES)
 def test_find_fusion_genes(conn, gene):
     result = get_genes_from_variant_types(conn, FUSION_NAMES)
diff --git a/tests/test_match.py b/tests/test_match.py
index 4cd2ff0..5b969b0 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -63,15 +63,6 @@ def test_expands_generalizations_refseq(self, alt_rep, conn):
         assert 'ENSG00000133703.11' in kras
         assert 'ENSG00000133703' in kras
 
-    @pytest.mark.parametrize(
-        'alt_rep', ('NM_033360.4', 'NM_033360', 'ENSG00000133703.11', 'ENSG00000133703')
-    )
-    def test_get_preferred_gene_name(self, alt_rep, conn):
-        gene_name = match.get_preferred_gene_name(conn, alt_rep)
-        assert (
-            'KRAS' == gene_name
-        ), f"Expected KRAS as preferred gene name for {alt_rep}, not '{gene_name}'"
-
 
 class TestMatchCopyVariant:
     def test_bad_category(self, conn):

From 5a74d561a436c595bbb094c8c1e0c7e93beee8a6 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Thu, 27 Oct 2022 14:44:37 -0700
Subject: [PATCH 04/32] SDEV-3256 - simplify and improve generalizability of
 get_pharmacogenomic_info and get_cancer_predisposition_info.

---
 graphkb/constants.py |  2 +-
 graphkb/genes.py     | 28 ++++++++++++----------------
 graphkb/vocab.py     |  6 ++----
 3 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/graphkb/constants.py b/graphkb/constants.py
index 912c010..2edf6b0 100644
--- a/graphkb/constants.py
+++ b/graphkb/constants.py
@@ -62,7 +62,7 @@
 ONCOGENE = 'oncogenic'
 TUMOUR_SUPPRESSIVE = 'tumour suppressive'
 FUSION_NAMES = ['structural variant', 'fusion']
-PHARMACOGENOMIC_RELEVANCE_TERMS = ["decreased toxicity", "increased toxicity"]
+
 PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]
 
 BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
diff --git a/graphkb/genes.py b/graphkb/genes.py
index 4217603..7dbf981 100644
--- a/graphkb/genes.py
+++ b/graphkb/genes.py
@@ -9,7 +9,6 @@
     GENE_RETURN_PROPERTIES,
     ONCOGENE,
     ONCOKB_SOURCE_NAME,
-    PHARMACOGENOMIC_RELEVANCE_TERMS,
     PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
     RELEVANCE_BASE_TERMS,
     TUMOUR_SUPPRESSIVE,
@@ -50,8 +49,7 @@ def _get_oncokb_gene_list(
 
 
 def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
-    """
-    Gets the list of oncogenes stored in GraphKB derived from OncoKB
+    """Gets the list of oncogenes stored in GraphKB derived from OncoKB.
 
     Args:
         conn: the graphkb connection object
@@ -63,8 +61,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
 
 
 def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
-    """
-    Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB
+    """Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB.
 
     Args:
         conn: the graphkb connection object
@@ -191,6 +188,10 @@ def get_preferred_gene_name(conn: GraphKBConnection, gene_name: str, source: str
         return KRAS for get_preferred_gene_name(conn, 'NM_033360')
         return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
     """
+    CHROMOSOMES = [f"chr{i}" for i in range(1, 24)] + ['chrX', 'chrY']
+    if gene_name in CHROMOSOMES:
+        logger.error(f"{gene_name} assumed to be a chromosome, not gene")
+        return ''
     eq = get_equivalent_features(conn=conn, gene_name=gene_name)
     genes = [m for m in eq if m.get('biotype', '') == 'gene' and not m.get('deprecated', False)]
     if not genes:
@@ -233,7 +234,7 @@ def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str],
     infer_genes = set()
     variants = {}
 
-    relevance = get_term_list("cancer predisposition")
+    relevance_rids = list(get_terms_set(conn, "cancer predisposition"))
 
     for record in conn.query(
         {
@@ -246,9 +247,7 @@ def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str],
                     },
                     "relevance": {
                         "target": "Vocabulary",
-                        "filters": {
-                            "@rid": [get_rid(conn, "Vocabulary", term) for term in relevance]
-                        },
+                        "filters": {"@rid": relevance_rids},
                     },
                 }
             ],
@@ -314,6 +313,8 @@ def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[s
     infer_genes = set()
     variants = {}
 
+    relevance_rids = list(get_terms_set(conn, "pharmacogenomic"))
+
     for record in conn.query(
         {
             "target": "Statement",
@@ -321,13 +322,8 @@ def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[s
                 {
                     "relevance": {
                         "target": "Vocabulary",
-                        "filters": {
-                            "@rid": [
-                                get_rid(conn, "Vocabulary", term)
-                                for term in PHARMACOGENOMIC_RELEVANCE_TERMS
-                            ]
-                        },
-                    }
+                        "filters": {"@rid": relevance_rids},
+                    },
                 }
             ],
             "returnProperties": [
diff --git a/graphkb/vocab.py b/graphkb/vocab.py
index 1e0199e..94fbdf9 100644
--- a/graphkb/vocab.py
+++ b/graphkb/vocab.py
@@ -187,10 +187,8 @@ def get_term_by_name(
 def get_terms_set(
     graphkb_conn: GraphKBConnection, base_terms: Iterable[str], ignore_cache: bool = False
 ) -> Set[str]:
-    """
-    Get a set of terms of vocabulary given some base/parent term names. Returns the record
-    IDs for the resulting terms
-    """
+    """Get a set of vocabulary rids given some base/parent term names."""
+    base_terms = [base_terms] if isinstance(base_terms, str) else base_terms
     cache_key = tuple(sorted(base_terms))
     if graphkb_conn.cache.get(cache_key, None) and not ignore_cache:
         return graphkb_conn.cache[cache_key]

From e5db64d2af619b045fd84e5382d36330b95423b1 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Thu, 27 Oct 2022 14:47:53 -0700
Subject: [PATCH 05/32] SDEV-3256 - remove unused functions and constant.

---
 graphkb/genes.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/graphkb/genes.py b/graphkb/genes.py
index 7dbf981..08648d1 100644
--- a/graphkb/genes.py
+++ b/graphkb/genes.py
@@ -10,7 +10,6 @@
     ONCOGENE,
     ONCOKB_SOURCE_NAME,
     PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
-    RELEVANCE_BASE_TERMS,
     TUMOUR_SUPPRESSIVE,
 )
 from .match import get_equivalent_features
@@ -165,15 +164,6 @@ def get_genes_from_variant_types(
     return result
 
 
-def get_term_list(target_category) -> List[str]:
-    """Load the relevance terms for 'cancer predisposition' variants."""
-    for category, base_terms in RELEVANCE_BASE_TERMS:
-        if category == target_category and base_terms:
-            return base_terms
-    else:
-        raise AssertionError(f"Undefined '{target_category}' category")
-
-
 def get_preferred_gene_name(conn: GraphKBConnection, gene_name: str, source: str = '#39:5') -> str:
     """Preferred gene symbol of a gene or transcript.
 

From a23d6a02dda1680c72e1c96760a131c4fc47c533 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Wed, 16 Nov 2022 10:44:37 -0800
Subject: [PATCH 06/32] PR - move CHROMOSOMES and PREFERRED_GENE_SOURCE to
 constants.py

---
 graphkb/constants.py | 6 +++++-
 graphkb/genes.py     | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/graphkb/constants.py b/graphkb/constants.py
index 2edf6b0..00e1845 100644
--- a/graphkb/constants.py
+++ b/graphkb/constants.py
@@ -3,12 +3,12 @@
 from .types import CategoryBaseTermMapping
 
 DEFAULT_LIMIT = 1000
-
 GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
 GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
 GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
 DEFAULT_URL = GKB_BASE_URL
 
+PREFERRED_GENE_SOURCE = "#39:5"  # HGNC
 
 BASE_RETURN_PROPERTIES = ['@rid', '@class']
 
@@ -76,6 +76,10 @@
     ('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
 ]
 
+CHROMOSOMES_HG38 = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
+CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ['x', 'y', 'mt']
+CHROMOSOMES = CHROMOSOMES_HG38 + CHROMOSOMES_HG19
+
 AMBIGUOUS_AA = ['x', '?', 'X']
 AA_3to1_MAPPING = {
     'Ala': 'A',
diff --git a/graphkb/genes.py b/graphkb/genes.py
index 08648d1..4a696e8 100644
--- a/graphkb/genes.py
+++ b/graphkb/genes.py
@@ -6,10 +6,12 @@
 from . import GraphKBConnection
 from .constants import (
     BASE_THERAPEUTIC_TERMS,
+    CHROMOSOMES,
     GENE_RETURN_PROPERTIES,
     ONCOGENE,
     ONCOKB_SOURCE_NAME,
     PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
+    PREFERRED_GENE_SOURCE,
     TUMOUR_SUPPRESSIVE,
 )
 from .match import get_equivalent_features
@@ -164,7 +166,9 @@ def get_genes_from_variant_types(
     return result
 
 
-def get_preferred_gene_name(conn: GraphKBConnection, gene_name: str, source: str = '#39:5') -> str:
+def get_preferred_gene_name(
+    conn: GraphKBConnection, gene_name: str, source: str = PREFERRED_GENE_SOURCE
+) -> str:
     """Preferred gene symbol of a gene or transcript.
 
     Args:
@@ -178,7 +182,6 @@ def get_preferred_gene_name(conn: GraphKBConnection, gene_name: str, source: str
         return KRAS for get_preferred_gene_name(conn, 'NM_033360')
         return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
     """
-    CHROMOSOMES = [f"chr{i}" for i in range(1, 24)] + ['chrX', 'chrY']
     if gene_name in CHROMOSOMES:
         logger.error(f"{gene_name} assumed to be a chromosome, not gene")
         return ''

From 07cebb53c8223f2befae043a39643109503c0d8a Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Thu, 17 Nov 2022 13:27:05 -0800
Subject: [PATCH 07/32] SDEV-3342 - add get_statements_from_variants function -
 adapted from pori_ipr_python.annotate.py

---
 graphkb/constants.py    |  1 +
 graphkb/statement.py    | 34 ++++++++++++++++++++++++++++++++--
 tests/test_statement.py | 13 +++++++++++++
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/graphkb/constants.py b/graphkb/constants.py
index 041c65c..bf7cad9 100644
--- a/graphkb/constants.py
+++ b/graphkb/constants.py
@@ -73,6 +73,7 @@
     ('cancer predisposition', ['pathogenic']),
     ('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
 ]
+FAILED_REVIEW_STATUS = 'failed'
 
 AMBIGUOUS_AA = ['x', '?', 'X']
 AA_3to1_MAPPING = {
diff --git a/graphkb/statement.py b/graphkb/statement.py
index 70fe87c..01bcd63 100644
--- a/graphkb/statement.py
+++ b/graphkb/statement.py
@@ -1,6 +1,9 @@
+from typing import List, cast
+
 from . import GraphKBConnection
-from .constants import RELEVANCE_BASE_TERMS
-from .types import CategoryBaseTermMapping
+from .constants import FAILED_REVIEW_STATUS, RELEVANCE_BASE_TERMS, STATEMENT_RETURN_PROPERTIES
+from .types import CategoryBaseTermMapping, Statement, Variant
+from .util import convert_to_rid_list
 from .vocab import get_terms_set
 
 
@@ -17,3 +20,30 @@ def categorize_relevance(
         if relevance_rid in term_set:
             return category
     return ''
+
+
+def get_statements_from_variants(
+    graphkb_conn: GraphKBConnection,
+    variants: List[Variant],
+    failed_review: bool = False,
+) -> List[Statement]:
+    """Given a list of variant records from GraphKB, return related statements.
+
+    Args:
+        graphkb_conn (GraphKBConnection): the graphkb api connection object
+        variants (list.<dict>): list of variant records. (Have @rid property.)
+        failed_review (bool): Include statements that failed review
+
+    Returns:
+        list.<dict>: list of Statement records from graphkb
+    """
+    statements = graphkb_conn.query(
+        {
+            'target': 'Statement',
+            'filters': {'conditions': convert_to_rid_list(variants), 'operator': 'CONTAINSANY'},
+            'returnProperties': STATEMENT_RETURN_PROPERTIES,
+        }
+    )
+    if not failed_review:
+        statements = [s for s in statements if s.get('reviewStatus') != FAILED_REVIEW_STATUS]
+    return [cast(Statement, s) for s in statements]
diff --git a/tests/test_statement.py b/tests/test_statement.py
index 4f2da9c..6562eef 100644
--- a/tests/test_statement.py
+++ b/tests/test_statement.py
@@ -4,6 +4,8 @@
 
 from graphkb import statement
 
+from .test_match import conn
+
 
 @pytest.fixture()
 def graphkb_conn():
@@ -79,3 +81,14 @@ def test_custom_categories(self, graphkb_conn):
             graphkb_conn, '1', [('blargh', ['some', 'blargh'])]
         )
         assert category == 'blargh'
+
+
+class TestStatementMatch:
+    def test_truncating_categories(self, conn):
+        variant = {
+            '@class': 'CategoryVariant',
+            '@rid': '#161:429',
+            'displayName': 'RB1 truncating',
+        }
+        statements = statement.get_statements_from_variants(conn, [variant])
+        assert statements

From 051283013bc78299e6ca26e60ab983d014b8e76c Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Fri, 2 Dec 2022 12:14:31 -0800
Subject: [PATCH 08/32] PR 72 - style suggestion - simplify get commands.

---
 graphkb/genes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/graphkb/genes.py b/graphkb/genes.py
index 4a696e8..93bc62a 100644
--- a/graphkb/genes.py
+++ b/graphkb/genes.py
@@ -186,12 +186,12 @@ def get_preferred_gene_name(
         logger.error(f"{gene_name} assumed to be a chromosome, not gene")
         return ''
     eq = get_equivalent_features(conn=conn, gene_name=gene_name)
-    genes = [m for m in eq if m.get('biotype', '') == 'gene' and not m.get('deprecated', False)]
+    genes = [m for m in eq if m.get('biotype') == 'gene' and not m.get('deprecated')]
     if not genes:
         logger.error(f"No genes found for: {gene_name}")
         return ''
     if source:
-        source_filtered_genes = [m for m in genes if m.get('source', '') == source]
+        source_filtered_genes = [m for m in genes if m.get('source') == source]
         if not source_filtered_genes:
             logger.error(f"No data from source {source} for {gene_name}")
         else:

From fe5002e4b2bc37de362a3ee714ce32965c1dc8fb Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Fri, 9 Dec 2022 15:53:37 -0800
Subject: [PATCH 09/32] Release v1.9.0 New Features:  -
 get_statements_from_variants - helper function added.  - get_term_list -
 helper function added.  - get_rid - helper function added.  -
 get_pharmacogenomic_info - helper function added.  -
 get_cancer_predisposition_info - helper function added.  - Added constants
 PHARMACOGENOMIC_RELEVANCE_TERMS and PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST
 Bugfix:  - Ignore gene name version in get_equivalent_features. Eg.
 NM_033360.4 should match 'NM_033360' and 'KRAS'. Improvements:  -
 get_preferred_gene_name - moved to genes.py  - move CHROMOSOMES and
 PREFERRED_GENE_SOURCE to constants.py  - added tests:    -
 test_get_pharmacogenomic_info    - test_get_cancer_predisposition_info Github
 Workflow Updates:  - Drop Python 3.6 testing  - actions/checkout@v3  -
 actions/setup-python@v3

---
 .github/workflows/pytest.yml | 6 +++---
 setup.cfg                    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index e36815f..eab849e 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -11,12 +11,12 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
+        python-version: ['3.7', '3.8', '3.9', '3.10']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
diff --git a/setup.cfg b/setup.cfg
index a4b85ee..9b25c96 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ include_trailing_comma = true
 [metadata]
 name = graphkb
 url = https://github.com/bcgsc/pori_graphkb_python
-version = 1.8.0
+version = 1.9.0
 author_email = graphkb@bcgsc.ca
 description = python adapter for interacting with the GraphKB API
 long_description = file: README.md

From 636ada7820b73c0d80a819dab30b49bf743ba7c6 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Wed, 18 Jan 2023 10:18:53 -0800
Subject: [PATCH 10/32] github actions update codecov/codecov-action@v1 to
 codecov/codecov-action@v3

---
 .github/workflows/pytest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index eab849e..121b588 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -45,7 +45,7 @@ jobs:
         # Use always() to always run this step to publish test results when there are test failures
       if: always()
     - name: Update code coverage report to CodeCov
-      uses: codecov/codecov-action@v1
+      uses: codecov/codecov-action@v3
       with:
         token: ${{ secrets.CODECOV_TOKEN }}
         file: ./coverage.xml

From 1d3688271cf7856d41fd69e25ae7c9b164b2390e Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 11:04:57 -0800
Subject: [PATCH 11/32] SDEV-3381 - github workflow fixes - add
 EXCLUDE_INTEGRATION_TESTS option to test_genes::test_find_fusion_genes.

---
 .github/workflows/pytest.yml | 1 +
 tests/test_genes.py          | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 121b588..3cf98d0 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -37,6 +37,7 @@ jobs:
       env:
         GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
         GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
+        EXCLUDE_INTEGRATION_TESTS: 1
     - name: Upload pytest test results
       uses: actions/upload-artifact@master
       with:
diff --git a/tests/test_genes.py b/tests/test_genes.py
index 591120e..86882f4 100644
--- a/tests/test_genes.py
+++ b/tests/test_genes.py
@@ -17,6 +17,8 @@
     get_therapeutic_associated_genes,
 )
 
+EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1'
+
 CANONICAL_ONCOGENES = ['kras', 'nras', 'alk']
 CANONICAL_TS = ['cdkn2a', 'tp53']
 CANONICAL_FUSION_GENES = ['alk', 'ewsr1', 'fli1']
@@ -145,6 +147,7 @@ def test_get_preferred_gene_name_kras(alt_rep, conn):
     ), f"Expected KRAS as preferred gene name for {alt_rep}, not '{gene_name}'"
 
 
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
 def test_find_fusion_genes(conn):
     result = get_genes_from_variant_types(conn, FUSION_NAMES)
     names = {row['name'] for row in result}

From 3d11c18a22dc5c1d8cf83541e740e4a8719eb7ed Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 12:14:54 -0800
Subject: [PATCH 12/32] SDEV-3381 - github workflow fixes -
 EXCLUDE_INTEGRATION_TESTS for test_truncating_categories and
 test_known_variants.

---
 tests/test_match.py     | 3 +++
 tests/test_statement.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/tests/test_match.py b/tests/test_match.py
index 8e81dcd..185082c 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -8,6 +8,8 @@
 from graphkb import GraphKBConnection, match
 from graphkb.util import FeatureNotFoundError
 
+EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1'
+
 INCREASE_PREFIXES = ['up', 'increase', 'over', 'gain', 'amp']
 DECREASE_PREFIXES = ['down', 'decrease', 'reduce', 'under', 'loss', 'delet']
 GENERAL_MUTATION = 'mutation'
@@ -364,6 +366,7 @@ def test_match_explicit_references(self, conn):
         )
         assert matches
 
+    @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
     @pytest.mark.parametrize(
         'known_variant,related_variants,unrelated_variants',
         [
diff --git a/tests/test_statement.py b/tests/test_statement.py
index 6562eef..ef232ba 100644
--- a/tests/test_statement.py
+++ b/tests/test_statement.py
@@ -1,11 +1,13 @@
 from unittest.mock import Mock
 
+import os
 import pytest
 
 from graphkb import statement
 
 from .test_match import conn
 
+EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1'
 
 @pytest.fixture()
 def graphkb_conn():
@@ -83,6 +85,7 @@ def test_custom_categories(self, graphkb_conn):
         assert category == 'blargh'
 
 
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
 class TestStatementMatch:
     def test_truncating_categories(self, conn):
         variant = {

From 40504f3377eb1e14a36eeceea301ac0f30ff853c Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 12:48:39 -0800
Subject: [PATCH 13/32] SDEV-3381 - lint w black

---
 tests/test_match.py     | 4 +++-
 tests/test_statement.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_match.py b/tests/test_match.py
index 185082c..b9c9719 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -366,7 +366,9 @@ def test_match_explicit_references(self, conn):
         )
         assert matches
 
-    @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
+    @pytest.mark.skipif(
+        EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    )
     @pytest.mark.parametrize(
         'known_variant,related_variants,unrelated_variants',
         [
diff --git a/tests/test_statement.py b/tests/test_statement.py
index ef232ba..6d7a01d 100644
--- a/tests/test_statement.py
+++ b/tests/test_statement.py
@@ -9,6 +9,7 @@
 
 EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1'
 
+
 @pytest.fixture()
 def graphkb_conn():
     def make_rid_list(*values):

From 11986e0bbc64072ec8996a509430fcfabce351bb Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 14:11:46 -0800
Subject: [PATCH 14/32] SDEV-3381 - switch default python to 3.9

---
 .github/workflows/pytest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 3cf98d0..eb8445a 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -54,4 +54,4 @@ jobs:
         env_vars: OS,PYTHON
         name: codecov-umbrella
         fail_ci_if_error: true
-      if: matrix.python-version == 3.8
+      if: matrix.python-version == 3.9

From 5ceb2253ebef68dff681e284f4a8db4f185ac017 Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 14:48:27 -0800
Subject: [PATCH 15/32] SDEV-3381 - try excluding more tests -
 test_get_cancer_predisposition_info, test_match_explicit_references and
 test_genomic_coordinates.

---
 tests/test_genes.py | 1 +
 tests/test_match.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/tests/test_genes.py b/tests/test_genes.py
index 86882f4..60bcc2d 100644
--- a/tests/test_genes.py
+++ b/tests/test_genes.py
@@ -131,6 +131,7 @@ def test_get_pharmacogenomic_info(conn):
             assert False, f"No rid found for a pharmacogenomic with {gene}"
 
 
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
 def test_get_cancer_predisposition_info(conn):
     genes, matches = get_cancer_predisposition_info(conn)
     for gene in CANCER_PREDISP_INITIAL_GENES:
diff --git a/tests/test_match.py b/tests/test_match.py
index b9c9719..89ba4b9 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -358,6 +358,7 @@ def test_match_explicit_reference1(self, conn):
         matches = match.match_positional_variant(conn, 'p.G12D', reference1=reference1)
         assert matches
 
+    @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
     def test_match_explicit_references(self, conn):
         reference1 = conn.query({'target': 'Feature', 'filters': {'name': 'BCR'}})[0]['@rid']
         reference2 = conn.query({'target': 'Feature', 'filters': {'name': 'ABL1'}})[0]['@rid']
@@ -418,6 +419,7 @@ def test_novel_specific_matches_general(self, conn):
         assert novel_specific not in names
         assert 'CDKN2A mutation' in names
 
+    @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
     def test_genomic_coordinates(self, conn):
         genomic = 'X:g.100611165A>T'
         match.match_positional_variant(conn, genomic)

From 635cee0a0e174567fee6dc1a27a3a246bcab68a9 Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 14:55:33 -0800
Subject: [PATCH 16/32] SDEV-3381 - lint w black

---
 tests/test_match.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_match.py b/tests/test_match.py
index 89ba4b9..112286b 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -358,7 +358,9 @@ def test_match_explicit_reference1(self, conn):
         matches = match.match_positional_variant(conn, 'p.G12D', reference1=reference1)
         assert matches
 
-    @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
+    @pytest.mark.skipif(
+        EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    )
     def test_match_explicit_references(self, conn):
         reference1 = conn.query({'target': 'Feature', 'filters': {'name': 'BCR'}})[0]['@rid']
         reference2 = conn.query({'target': 'Feature', 'filters': {'name': 'ABL1'}})[0]['@rid']
@@ -419,7 +421,9 @@ def test_novel_specific_matches_general(self, conn):
         assert novel_specific not in names
         assert 'CDKN2A mutation' in names
 
-    @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
+    @pytest.mark.skipif(
+        EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    )
     def test_genomic_coordinates(self, conn):
         genomic = 'X:g.100611165A>T'
         match.match_positional_variant(conn, genomic)

From 47d0dbb847408dcb8153619c809d25a5a65df614 Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 15:29:49 -0800
Subject: [PATCH 17/32] SDEV-3381 - try excluding more tests -
 test_get_therapeutic_associated_genes

---
 tests/test_genes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_genes.py b/tests/test_genes.py
index 60bcc2d..379c5e9 100644
--- a/tests/test_genes.py
+++ b/tests/test_genes.py
@@ -156,6 +156,7 @@ def test_find_fusion_genes(conn):
         assert gene in names, f"{gene} was not identified as a fusion gene."
 
 
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
 def test_get_therapeutic_associated_genes(conn):
     gene_list = get_therapeutic_associated_genes(graphkb_conn=conn)
     assert gene_list, 'No get_therapeutic_associated_genes found'

From 276011e40a02c1e2a0cb80d0f19e42f4d63c841b Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 16:22:23 -0800
Subject: [PATCH 18/32] SDEV-3381 - try excluding more tests -
 test_low_gain_excludes_amplification and test_known_reduced_expression.

---
 tests/test_match.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_match.py b/tests/test_match.py
index 112286b..7849ee1 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -122,6 +122,9 @@ def test_known_gain(self, conn):
         for variant_type in types_selected:
             assert not has_prefix(variant_type, DECREASE_PREFIXES)
 
+    @pytest.mark.skipif(
+        EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    )
     def test_low_gain_excludes_amplification(self, conn):
         matches = match.match_copy_variant(conn, 'KRAS', match.INPUT_COPY_CATEGORIES.GAIN)
 
@@ -168,6 +171,9 @@ def test_bad_gene_name(self, conn):
                 conn, 'not a real gene name', match.INPUT_EXPRESSION_CATEGORIES.UP
             )
 
+    @pytest.mark.skipif(
+        EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    )
     def test_known_reduced_expression(self, conn):
         matches = match.match_expression_variant(
             conn, 'PTEN', match.INPUT_EXPRESSION_CATEGORIES.DOWN

From 191c07dd1db45694d7c0a36d060d8ed24c4ef726 Mon Sep 17 00:00:00 2001
From: dustinbleile <dustinbleile@gmail.com>
Date: Thu, 19 Jan 2023 16:51:44 -0800
Subject: [PATCH 19/32] SDEV-3381 - try excluding more tests for python 3.9 -
 test_known_increased_expression

---
 tests/test_match.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_match.py b/tests/test_match.py
index 7849ee1..58f9f1a 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -203,6 +203,9 @@ def test_known_reduced_expression_gene_id(self, conn):
         for variant_type in types_selected:
             assert not has_prefix(variant_type, INCREASE_PREFIXES)
 
+    @pytest.mark.skipif(
+        EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    )
     def test_known_increased_expression(self, conn):
         matches = match.match_expression_variant(conn, 'CA9', match.INPUT_EXPRESSION_CATEGORIES.UP)
         assert matches

From f2df1a5a3cabcc7cd39aba7fc3c4cb28e3888a60 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Fri, 20 Jan 2023 09:47:56 -0800
Subject: [PATCH 20/32] SDEV-3381 - try excluding more tests for python 3.8 -
 test_tert_promoter

---
 tests/test_match.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_match.py b/tests/test_match.py
index 58f9f1a..c70ca1e 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -438,6 +438,9 @@ def test_genomic_coordinates(self, conn):
         match.match_positional_variant(conn, genomic)
         # no assert b/c checking for no error rather than the result
 
+    @pytest.mark.skipif(
+        EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    )
     def test_tert_promoter(self, conn):
         assert match.match_positional_variant(conn, 'TERT:c.-124C>T')
 

From d8b68aef911e9d67953381c68cbe7225a4c8d06d Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 24 Jan 2023 15:35:02 -0800
Subject: [PATCH 21/32] bump up retry settings

---
 graphkb/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphkb/util.py b/graphkb/util.py
index cce6862..62cc472 100644
--- a/graphkb/util.py
+++ b/graphkb/util.py
@@ -100,7 +100,7 @@ def __init__(
         use_global_cache: bool = True,
     ):
         self.http = requests.Session()
-        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+        retries = Retry(total=30, backoff_factor=10, status_forcelist=[429, 500, 502, 503, 504])
         self.http.mount("https://", HTTPAdapter(max_retries=retries))
 
         self.token = ''

From 226f2486ad300a42ed9274c92f1fe43b912ed43d Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 24 Jan 2023 15:40:07 -0800
Subject: [PATCH 22/32] remove 3.6 from workflow

---
 .github/workflows/pytest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index e36815f..282f866 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
+        python-version: ['3.7', '3.8', '3.9', '3.10']
 
     steps:
     - uses: actions/checkout@v2

From d974dc53004a3c326bddf16fe0f935258fd60706 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Tue, 24 Jan 2023 15:43:57 -0800
Subject: [PATCH 23/32] SDEV-3381 - increasing Retry(total=6,
 backoff_factor=30)

---
 graphkb/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphkb/util.py b/graphkb/util.py
index cce6862..883f25c 100644
--- a/graphkb/util.py
+++ b/graphkb/util.py
@@ -100,7 +100,7 @@ def __init__(
         use_global_cache: bool = True,
     ):
         self.http = requests.Session()
-        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+        retries = Retry(total=6, backoff_factor=30, status_forcelist=[429, 500, 502, 503, 504])
         self.http.mount("https://", HTTPAdapter(max_retries=retries))
 
         self.token = ''

From 42cc582fcd2e16e05c2e950d91142a81d4dbbb3f Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Feb 2023 15:59:23 -0800
Subject: [PATCH 24/32] add retries on connection failure

---
 .github/workflows/pytest.yml |  5 +-
 graphkb/util.py              | 99 ++++++++++++++++++++++++++++++------
 setup.cfg                    |  9 +++-
 3 files changed, 94 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index eb8445a..f5ccd00 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -10,6 +10,7 @@ jobs:
 
     runs-on: ubuntu-latest
     strategy:
+      max-parallel: 1
       matrix:
         python-version: ['3.7', '3.8', '3.9', '3.10']
 
@@ -33,7 +34,9 @@ jobs:
         pip install black
         black --check -S -l 100 graphkb tests
     - name: Test with pytest
-      run: pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov graphkb --cov-report term --cov-report xml
+      run: |
+        pip list
+        pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov graphkb --cov-report term --cov-report xml --durations 10 -vv
       env:
         GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
         GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
diff --git a/graphkb/util.py b/graphkb/util.py
index 883f25c..4af5fe7 100644
--- a/graphkb/util.py
+++ b/graphkb/util.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import re
+import time
 from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, cast
 
@@ -100,7 +101,13 @@ def __init__(
         use_global_cache: bool = True,
     ):
         self.http = requests.Session()
-        retries = Retry(total=6, backoff_factor=30, status_forcelist=[429, 500, 502, 503, 504])
+        retries = Retry(
+            total=100,
+            connect=5,
+            status=5,
+            backoff_factor=5,
+            status_forcelist=[429, 500, 502, 503, 504],
+        )
         self.http.mount("https://", HTTPAdapter(max_retries=retries))
 
         self.token = ''
@@ -123,7 +130,12 @@ def load(self) -> Optional[float]:
             )
         return None
 
-    def request(self, endpoint: str, method: str = 'GET', **kwargs) -> Dict:
+    def request(
+        self,
+        endpoint: str,
+        method: str = 'GET',
+        **kwargs,
+    ) -> Dict:
         """Request wrapper to handle adding common headers and logging.
 
         Args:
@@ -135,18 +147,54 @@ def request(self, endpoint: str, method: str = 'GET', **kwargs) -> Dict:
         """
         url = join_url(self.url, endpoint)
         self.request_count += 1
+
+        # don't want to use a read timeout if the request is not idempotent
+        # otherwise you may wind up making unintended changes
+        timeout = None
+        if endpoint in ['query', 'parse']:
+            timeout = (connect_timeout = 7, read_timeout = 61)
+
         start_time = datetime.now()
+
         if not self.first_request:
             self.first_request = start_time
         self.last_request = start_time
-        resp = requests.request(method, url, headers=self.headers, **kwargs)
 
-        if resp.status_code == 401 or resp.status_code == 403:
-            # try to re-login if the token expired
+        # including check on OSError due to https://stackoverflow.com/questions/74253820/cannot-catch-requests-exceptions-connectionerror-with-try-except
+        # ConnectionError may be thrown instead of getting a resp object with a checkable status code,
+        # but might still want to try again.
+        # manual retry examples:
+        # https://blog.miguelgrinberg.com/post/how-to-retry-with-class
+        # https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+        # add manual retry:
+        attempts = range(15)
+        for attempt in attempts:
+            if attempt > 0:
+                time.sleep(2)  # wait a bit between retries
+            try:
+                self.refresh_login()
+                self.request_count += 1
+                resp = requests.request(
+                    method,
+                    url,
+                    headers=self.headers,
+                    timeout=timeout,
+                    **kwargs,
+                )
+                if resp.status_code == 401 or resp.status_code == 403:
+                    logger.debug(f'/{endpoint} - {resp.status_code} - retrying')
+                    # try to re-login if the token expired
+                    continue
+                else:
+                    break
+            except (requests.exceptions.ConnectionError, OSError) as err:
+                if attempt < len(attempts) - 1:
+                    logger.debug(f'/{endpoint} - {str(err)} - retrying')
+                    continue
+                raise err
+            except Exception as err2:
+                raise err2
 
-            self.refresh_login()
-            self.request_count += 1
-            resp = requests.request(method, url, headers=self.headers, **kwargs)
         timing = millis_interval(start_time, datetime.now())
         logger.debug(f'/{endpoint} - {resp.status_code} - {timing} ms')  # type: ignore
 
@@ -171,15 +219,31 @@ def post(self, uri: str, data: Dict = {}, **kwargs) -> Dict:
     def login(self, username: str, password: str) -> None:
         self.username = username
         self.password = password
+        connect_timeout = 7
+        read_timeout = 61
 
         # use requests package directly to avoid recursion loop on login failure
-        self.request_count += 1
-        resp = requests.request(
-            url=f'{self.url}/token',
-            method='POST',
-            headers=self.headers,
-            data=json.dumps({'username': username, 'password': password}),
-        )
+        attempts = range(10)
+        for attempt in attempts:
+            if attempt > 0:
+                time.sleep(2)  # wait a bit between retries
+            try:
+                self.request_count += 1
+                resp = requests.request(
+                    url=f'{self.url}/token',
+                    method='POST',
+                    headers=self.headers,
+                    timeout=(connect_timeout, read_timeout),
+                    data=json.dumps({'username': username, 'password': password}),
+                )
+                break
+            except (requests.exceptions.ConnectionError, OSError) as err:
+                if attempt < len(attempts) - 1:
+                    logger.debug(f'/login - {str(err)} - retrying')
+                    continue
+                raise err
+            except Exception as err2:
+                raise err2
         resp.raise_for_status()
         content = resp.json()
         self.token = content['kbToken']
@@ -213,7 +277,10 @@ def query(
                 return self.cache[hash_code]
 
         while True:
-            content = self.post('query', data={**request_body, 'limit': limit, 'skip': len(result)})
+            content = self.post(
+                'query',
+                data={**request_body, 'limit': limit, 'skip': len(result)},
+            )
             records = content['result']
             result.extend(records)
             if len(records) < limit or not paginate:
diff --git a/setup.cfg b/setup.cfg
index 9b25c96..34b143c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,13 +29,13 @@ install_requires =
 deploy = twine; wheel
 test = pytest; pytest-runner; pytest-cov
 doc = mkdocs; markdown_refdocs; mkdocs-material; mkdocs-redirects
-dev = 
+dev =
     twine
     wheel
     pytest
     pytest-runner
     pytest-cov
-    mkdocs 
+    mkdocs
     markdown_refdocs
     mkdocs-material
     mkdocs-redirects
@@ -44,6 +44,11 @@ dev =
     flake8-annotations
     isort
     mypy
+    xdist==1.27.0
+    forked==1.0.2
+    pluggy==0.13.1
+    pytest==6.1.2
+    cov==2.10.1
 
 [options.package_data]
 graphkb = py.typed

From ba34a5961cff6c0844802ddf9aa10a96284cad54 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Feb 2023 16:10:20 -0800
Subject: [PATCH 25/32] cleanup

---
 graphkb/util.py | 17 +++++++----------
 setup.cfg       |  5 -----
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/graphkb/util.py b/graphkb/util.py
index 4af5fe7..957ae1a 100644
--- a/graphkb/util.py
+++ b/graphkb/util.py
@@ -147,12 +147,14 @@ def request(
         """
         url = join_url(self.url, endpoint)
         self.request_count += 1
+        connect_timeout = 7
+        read_timeout = 61
 
         # don't want to use a read timeout if the request is not idempotent
         # otherwise you may wind up making unintended changes
         timeout = None
         if endpoint in ['query', 'parse']:
-            timeout = (connect_timeout = 7, read_timeout = 61)
+            timeout = (connect_timeout, read_timeout)
 
         start_time = datetime.now()
 
@@ -160,17 +162,12 @@ def request(
             self.first_request = start_time
         self.last_request = start_time
 
-        # including check on OSError due to https://stackoverflow.com/questions/74253820/cannot-catch-requests-exceptions-connectionerror-with-try-except
-        # ConnectionError may be thrown instead of getting a resp object with a checkable status code,
-        # but might still want to try again.
-        # manual retry examples:
-        # https://blog.miguelgrinberg.com/post/how-to-retry-with-class
-        # https://www.peterbe.com/plog/best-practice-with-retries-with-requests
-        # add manual retry:
+        # using a manual retry as well as using the requests Retry() object because
+        # a ConnectionError or OSError might be thrown and we still want to retry in those cases
         attempts = range(15)
         for attempt in attempts:
             if attempt > 0:
-                time.sleep(2)  # wait a bit between retries
+                time.sleep(2)  # wait between retries
             try:
                 self.refresh_login()
                 self.request_count += 1
@@ -226,7 +223,7 @@ def login(self, username: str, password: str) -> None:
         attempts = range(10)
         for attempt in attempts:
             if attempt > 0:
-                time.sleep(2)  # wait a bit between retries
+                time.sleep(2)  # wait between retries
             try:
                 self.request_count += 1
                 resp = requests.request(
diff --git a/setup.cfg b/setup.cfg
index 34b143c..a16c6f3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -44,11 +44,6 @@ dev =
     flake8-annotations
     isort
     mypy
-    xdist==1.27.0
-    forked==1.0.2
-    pluggy==0.13.1
-    pytest==6.1.2
-    cov==2.10.1
 
 [options.package_data]
 graphkb = py.typed

From 0bb90f7fa9d772bc8f09b2961a36aa3693f7eb59 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Feb 2023 16:18:36 -0800
Subject: [PATCH 26/32] add comment to document OSError check

---
 graphkb/util.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/graphkb/util.py b/graphkb/util.py
index 957ae1a..00f2528 100644
--- a/graphkb/util.py
+++ b/graphkb/util.py
@@ -163,7 +163,9 @@ def request(
         self.last_request = start_time
 
         # using a manual retry as well as using the requests Retry() object because
-        # a ConnectionError or OSError might be thrown and we still want to retry in those cases
+        # a ConnectionError or OSError might be thrown and we still want to retry in those cases.
+        # about catching OSError as well as ConnectionError:
+        # https://stackoverflow.com/questions/74253820
         attempts = range(15)
         for attempt in attempts:
             if attempt > 0:

From 15952ead0325bb0b405fa5b3fcec4736e7639e9f Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Feb 2023 16:42:59 -0800
Subject: [PATCH 27/32] change version back to current release

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index a16c6f3..f035839 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ include_trailing_comma = true
 [metadata]
 name = graphkb
 url = https://github.com/bcgsc/pori_graphkb_python
-version = 1.9.0
+version = 1.8.0
 author_email = graphkb@bcgsc.ca
 description = python adapter for interacting with the GraphKB API
 long_description = file: README.md

From 1e6adfdf058f42d6f5e33d976cf614a94a708a43 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Thu, 16 Feb 2023 10:57:28 -0800
Subject: [PATCH 28/32] add long-running tests back to workflow

---
 .github/workflows/pytest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index f5ccd00..dcfcdf6 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -40,7 +40,7 @@ jobs:
       env:
         GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
         GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
-        EXCLUDE_INTEGRATION_TESTS: 1
+        EXCLUDE_INTEGRATION_TESTS: 0
     - name: Upload pytest test results
       uses: actions/upload-artifact@master
       with:

From 7cc535315966d3a19d771da7a3c2d9e60c452933 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Thu, 16 Feb 2023 14:13:11 -0800
Subject: [PATCH 29/32] reduce runtime of longest test

---
 tests/test_genes.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/test_genes.py b/tests/test_genes.py
index 379c5e9..51b2a7c 100644
--- a/tests/test_genes.py
+++ b/tests/test_genes.py
@@ -16,14 +16,17 @@
     get_preferred_gene_name,
     get_therapeutic_associated_genes,
 )
+from graphkb.util import get_rid
 
 EXCLUDE_INTEGRATION_TESTS = os.environ.get('EXCLUDE_INTEGRATION_TESTS') == '1'
 
 CANONICAL_ONCOGENES = ['kras', 'nras', 'alk']
 CANONICAL_TS = ['cdkn2a', 'tp53']
 CANONICAL_FUSION_GENES = ['alk', 'ewsr1', 'fli1']
+CANONICAL_STRUCTURAL_VARIANT_GENES = ['brca1', 'dpyd', 'pten']
 CANNONICAL_THERAPY_GENES = ['erbb2', 'brca2', 'egfr']
 
+
 PHARMACOGENOMIC_INITIAL_GENES = [
     'ACYP2',
     'CEP72',
@@ -149,11 +152,20 @@ def test_get_preferred_gene_name_kras(alt_rep, conn):
 
 
 @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
-def test_find_fusion_genes(conn):
-    result = get_genes_from_variant_types(conn, FUSION_NAMES)
+def test_find_genes_by_variant_type_structural_variant(conn):
+    result = get_genes_from_variant_types(conn, ['structural variant'])
     names = {row['name'] for row in result}
-    for gene in CANONICAL_FUSION_GENES:
-        assert gene in names, f"{gene} was not identified as a fusion gene."
+    for gene in CANONICAL_STRUCTURAL_VARIANT_GENES:
+        assert gene in names, f"{gene} was not identified as a structural variant gene."
+
+
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")
+def test_find_no_genes_by_variant_type_with_nonmatching_source_record_id(conn):
+    refseq_id = get_rid(conn, target='source', name='refseq')
+    result = get_genes_from_variant_types(
+        conn, ['structural variant'], source_record_ids=[refseq_id]
+    )
+    assert not result
 
 
 @pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests")

From 172789fa0f3fec2312ab0a0087df49b368ab7210 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Thu, 16 Feb 2023 17:25:31 -0800
Subject: [PATCH 30/32] add match test

---
 tests/test_match.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/test_match.py b/tests/test_match.py
index c70ca1e..90c1108 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -65,6 +65,15 @@ def test_expands_generalizations_refseq(self, alt_rep, conn):
         assert 'ENSG00000133703.11' in kras
         assert 'ENSG00000133703' in kras
 
+    def test_checks_by_source_id_kras(self, conn):
+        kras = [
+            f['displayName']
+            for f in match.get_equivalent_features(
+                conn, 'nm_033360', source='refseq', source_id_version='4', is_source_id=True
+            )
+        ]
+        assert 'KRAS' in kras
+
 
 class TestMatchCopyVariant:
     def test_bad_category(self, conn):

From 6f58d91cbc39b27af4795bb65a4b044a792d4988 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Thu, 16 Feb 2023 17:26:00 -0800
Subject: [PATCH 31/32] run in parallel

---
 .github/workflows/pytest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index dcfcdf6..f34a5ce 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -10,7 +10,7 @@ jobs:
 
     runs-on: ubuntu-latest
     strategy:
-      max-parallel: 1
+      max-parallel: 4
       matrix:
         python-version: ['3.7', '3.8', '3.9', '3.10']
 

From d5a993283c14b4a9cf904edd57ccd6a5e3d5e16c Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Wed, 22 Feb 2023 10:22:19 -0800
Subject: [PATCH 32/32] Release v1.9.0 New Features:  *
 get_statements_from_variants - helper function added.  * get_term_list -
 helper function added.  * get_rid - helper function added.  *
 get_pharmacogenomic_info - helper function added.  *
 get_cancer_predisposition_info - helper function added.  * Added constants
 PHARMACOGENOMIC_RELEVANCE_TERMS and PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST
 Bugfix:  * Ignore gene name version in get_equivalent_features. Eg.
 NM_033360.4 should match 'NM_033360' and 'KRAS'. Improvements:  *
 get_preferred_gene_name - moved to genes.py  * move CHROMOSOMES and
 PREFERRED_GENE_SOURCE to constants.py  * add retries on connection failure
 Added tests:  * test_get_pharmacogenomic_info  *
 test_get_cancer_predisposition_info  * reduced runtime of longest test  * add
 match test GitHub Workflow Updates:  * Run in parallel  * Drop Python 3.6
 testing  * actions/checkout@v3  * actions/setup-python@v3

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index f035839..a16c6f3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ include_trailing_comma = true
 [metadata]
 name = graphkb
 url = https://github.com/bcgsc/pori_graphkb_python
-version = 1.8.0
+version = 1.9.0
 author_email = graphkb@bcgsc.ca
 description = python adapter for interacting with the GraphKB API
 long_description = file: README.md