Skip to content

Commit

Permalink
Merge pull request #693 from opencb/TASK-6347
Browse files Browse the repository at this point in the history
TASK-6347 Fix normalization issue in ClinVar
  • Loading branch information
jtarraga authored Jun 26, 2024
2 parents a618d75 + f820260 commit dcbb95c
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 91 deletions.
14 changes: 10 additions & 4 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,15 @@ download:
clinvar:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz
version: 2024-05
clinvarVariation:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz
version: 2024-05
clinvarSummary:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
clinvarVariationAllele:
Expand All @@ -135,8 +139,10 @@ download:
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
gwasCatalog:
host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
version: "1.0.2 associations_e106_r2022-05-17"
#host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv"
#version: "1.0.2 associations_e106_r2022-05-17"
version: "2024-05-20"
hpo:
host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
disgenet:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ public class EtlCommons {
public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";

public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
public static final String CLINVAR_VERSION = "2022.11";
public static final String CLINVAR_DATE = "2022-11";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz";
public static final String CLINVAR_VERSION = "2024-05";
public static final String CLINVAR_DATE = "2024-05";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz";
public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ private void printSummary() {
}

private boolean updateRocksDB(SequenceLocation sequenceLocation, String variationId, String[] lineFields,
String mateVariantString, Map<String, EFO> traitsToEfoTermsMap)
String mateVariantString, Map<String, EFO> traitsToEfoTermsMap)
throws RocksDBException, IOException {
// More than one variant being returned from the normalisation process would mean it's an MNV which has been
// decomposed
Expand Down Expand Up @@ -266,13 +266,34 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy
}

// parse RCVs
String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion()
.getClinicalSignificance()
.getDescription();
String reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance()
.getReviewStatus().name();
List<ObservationSet> getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn();
String accession = null;
try {
accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
} catch (Exception e) {
logger.warn("Error getting accession. Ignore error and leave it as null.", e);
}
String clinicalSignficanceDescription = null;
try {
clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion()
.getClinicalSignificance()
.getDescription();
} catch (Exception e) {
logger.warn("Error getting clinical significance description. Ignore error and leave it as null.", e);
}
String reviewStatusName = null;
try {
reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance()
.getReviewStatus().name();
} catch (Exception e) {
logger.warn("Error getting review status name. Ignore error and leave it as null.", e);
}
List<ObservationSet> getObservedIn = null;
try {
getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn();
} catch (Exception e) {
logger.warn("Error getting observed in. Ignore error and leave it as null.", e);
}

addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString,
clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription,
reviewStatusName, getObservedIn);
Expand Down Expand Up @@ -388,7 +409,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
Map<String, EFO> traitsToEfoTermsMap, String accession,
String clinicalSignficanceDescription, String reviewStatusName,
List<ObservationSet> getObservedIn)
throws JsonProcessingException {
throws JsonProcessingException {

List<Property> additionalProperties = new ArrayList<>(3);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE);
Expand Down Expand Up @@ -544,7 +565,7 @@ private ModeOfInheritance getModeOfInheritance(String modeOfInheritance) {
private List<GenomicFeature> getGenomicFeature(PublicSetType publicSet, String alleleId) {
if (publicSet.getReferenceClinVarAssertion().getMeasureSet() != null) {
return getGenomicFeature(publicSet.getReferenceClinVarAssertion().getMeasureSet());
// No measureSet means there must be genotypeSet
// No measureSet means there must be genotypeSet
} else if (publicSet.getReferenceClinVarAssertion().getGenotypeSet() != null) {
for (MeasureSetType measureSet : publicSet.getReferenceClinVarAssertion().getGenotypeSet().getMeasureSet()) {
if (measureSet.getMeasure() != null) {
Expand Down Expand Up @@ -596,7 +617,7 @@ private List<HeritableTrait> getHeritableTrait(PublicSetType publicSet, Map<Stri
// root of the ReferenceClinvarAssertion rather than for each trait
ModeOfInheritance modeOfInheritance
= getInheritanceModel(publicSet.getReferenceClinVarAssertion().getAttributeSet(),
sourceInheritableTraitMap);
sourceInheritableTraitMap);

for (TraitType trait : publicSet.getReferenceClinVarAssertion().getTraitSet().getTrait()) {
String traitName = getTraitName(trait, publicSet);
Expand Down Expand Up @@ -649,14 +670,14 @@ private String getTraitName(TraitType trait, PublicSetType publicSet) {
// Found preferred name
if (i < trait.getName().size()) {
return trait.getName().get(i).getElementValue().getValue();
// No preferred name indicated (e.g. RCV000013735 version Jan 2020); arbitrarily return first one
// No preferred name indicated (e.g. RCV000013735 version Jan 2020); arbitrarily return first one
} else if (trait.getName().size() > 0) {
logger.warn("ClinVar record found "
+ publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc()
+ " with no preferred trait provided. Arbitrarily selecting first one: {}", trait.getName()
.get(0).getElementValue().getValue());
return trait.getName().get(0).getElementValue().getValue();
// No trait name provided at all
// No trait name provided at all
} else {
throw new IllegalArgumentException("ClinVar record found "
+ publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,12 @@ public abstract class ClinicalIndexer {
protected VariantNormalizer normalizer;

public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException {
// Forcing decomposition here in all cases - assuming the way CellBase stores clinical variants from here
// onwards will be decomposed and Adaptors will deal with phased/no-phased queries
// Use the same OpenCGA normalization parameters
VariantNormalizer.VariantNormalizerConfig variantNormalizerConfig
= (new VariantNormalizer.VariantNormalizerConfig())
.setReuseVariants(true)
.setNormalizeAlleles(false)
.setDecomposeMNVs(true);
.setNormalizeAlleles(true)
.setDecomposeMNVs(false);

if (genomeSequenceFilePath != null) {
logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public class CosmicIndexer extends ClinicalIndexer {
private Pattern mutationGRCh37GenomePositionPattern;
private Pattern snvPattern;

private static final String COSMIC_VERSION = "v95";
private static final String COSMIC_VERSION = "v99";

private static final int GENE_NAMES_COLUMN = 0;
private static final int HGNC_COLUMN = 3;
Expand Down
Loading

0 comments on commit dcbb95c

Please sign in to comment.