Skip to content

Commit

Permalink
Prepare release 6.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
juanfeSanahuja committed Mar 5, 2024
1 parent cb72ab6 commit f95bf41
Show file tree
Hide file tree
Showing 19 changed files with 173 additions and 73 deletions.
2 changes: 1 addition & 1 deletion cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ RUN cd /opt/ensembl && \
git clone https://github.com/Ensembl/ensembl-compara.git && \
git clone https://github.com/Ensembl/ensembl-io.git

ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase
ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts
8 changes: 4 additions & 4 deletions cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157";
our $ENSEMBL_GENOMES_USER = "anonymous";

## Vertebrates
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38";
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38";
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38";
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38";
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38";
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38";
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38";
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38";
#our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
#our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
#our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";
Expand Down
2 changes: 1 addition & 1 deletion cellbase-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<parent>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase</artifactId>
<version>5.8.2</version>
<version>6.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down
2 changes: 1 addition & 1 deletion cellbase-client/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<parent>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase</artifactId>
<version>5.8.2</version>
<version>6.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down
2 changes: 1 addition & 1 deletion cellbase-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<parent>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase</artifactId>
<version>5.8.2</version>
<version>6.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ public class DownloadProperties {
private EnsemblProperties ensembl;
private EnsemblProperties ensemblGenomes;
private URLProperties hgnc;
private URLProperties cancerHotspot;
private URLProperties refSeq;
private URLProperties refSeqFasta;
private URLProperties refSeqProteinFasta;
Expand Down Expand Up @@ -70,6 +71,7 @@ public class DownloadProperties {
private URLProperties hpoObo;
private URLProperties goObo;
private URLProperties doidObo;
private URLProperties mondoObo;
private URLProperties goAnnotation;
private URLProperties revel;
private URLProperties pubmed;
Expand Down Expand Up @@ -517,6 +519,24 @@ public DownloadProperties setHgnc(URLProperties hgnc) {
return this;
}

/**
 * Returns the download properties held in the {@code cancerHotspot} field.
 *
 * @return the current {@code cancerHotspot} value (may be {@code null} if never set)
 */
public URLProperties getCancerHotspot() {
    return this.cancerHotspot;
}

/**
 * Stores the given download properties in the {@code cancerHotspot} field.
 *
 * @param cancerHotspot value to store
 * @return this {@code DownloadProperties} instance, so calls can be chained
 */
public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) {
this.cancerHotspot = cancerHotspot;
return this;
}

/**
 * Returns the download properties held in the {@code mondoObo} field.
 *
 * @return the current {@code mondoObo} value (may be {@code null} if never set)
 */
public URLProperties getMondoObo() {
    return this.mondoObo;
}

/**
 * Stores the given download properties in the {@code mondoObo} field.
 *
 * @param mondoObo value to store
 * @return this {@code DownloadProperties} instance, so calls can be chained
 */
public DownloadProperties setMondoObo(URLProperties mondoObo) {
this.mondoObo = mondoObo;
return this;
}

public static class EnsemblProperties {

private DatabaseCredentials database;
Expand Down
74 changes: 55 additions & 19 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ download:
url:
host: ftp://ftp.ensemblgenomes.org/pub
hgnc:
host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2022-01-01.txt
host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt
version: 2023-11-01
cancerHotspot:
host: https://www.cancerhotspots.org/files/hotspots_v2.xls
version: "v2"
refSeq:
host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz
refSeqFasta:
Expand All @@ -73,12 +77,15 @@ download:
host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz
maneSelect:
# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz
host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz
version: 0.93
# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz
host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz
version: "1.1"
lrg:
host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt
version: "2021-03-30"
geneUniprotXref:
host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/
version: "2023-11-08"
geneExpressionAtlas:
host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz
mirbase:
Expand All @@ -88,33 +95,49 @@ download:
targetScan:
host: http://hgdownload.cse.ucsc.edu/goldenPath/
miRTarBase:
host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/8.0/hsa_MTI.xlsx
host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx
version: "9.0"

## Protein Data
uniprot:
host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
version: "2023-11-08"
uniprotRelNotes:
host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
intact:
host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
version: "2023-11-08"
interpro:
host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz
host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz
version: "2023-11-08"
interproRelNotes:
host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt
host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt
intact:
host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
version: "2023-10-07"

## Conservation Scores
conservation:
host: https://hgdownload.cse.ucsc.edu/goldenPath/
version: "2022-08-30"
gerp:
host: http://ftp.ensembl.org/pub/release-104/compara/conservation_scores/90_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw
host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw
version: "2023-05-17"
clinvar:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz
version: "2023-12-01"
clinvarVariation:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz
clinvarSummary:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
version: "2023-12-01"
clinvarVariationAllele:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz
version: "2023-12-01"
clinvarEfoTerms:
host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv
iarctp53:
Expand All @@ -132,30 +155,43 @@ download:
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
gwasCatalog:
host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
version: "1.0.2 associations_e106_r2022-05-17"
# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
# version: "1.0.2 associations_e106_r2022-05-17"
host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv
version: "23-12-21"
hpo:
## Download manually from here now: https://hpo.jax.org/app/data/annotations
host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
disgenet:
host: https://www.disgenet.org/static/disgenet_ap1/files/downloads
files:
- all_gene_disease_associations.tsv.gz
- readme.txt
dgidb:
host: https://dgidb.org/data/monthly_tsvs/2021-Jan/interactions.tsv
host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv
version: "2022-02-01"
cadd:
host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP!
# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz
version: "1.7-pre"
reactome:
host: http://www.reactome.org/download/current/biopax.zip
gnomadConstraints:
host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz
version: 2.1.1
version: "2.1.1"
hpoObo:
host: http://purl.obolibrary.org/obo/hp.obo
version: "2023-12-01"
goObo:
host: http://purl.obolibrary.org/obo/go/go-basic.obo
version: "2023-12-01"
doidObo:
host: http://purl.obolibrary.org/obo/doid.obo
version: "2023-12-01"
mondoObo:
host: http://purl.obolibrary.org/obo/mondo.obo
version: "2023-12-01"
goAnnotation:
host: http://geneontology.org/gene-associations/goa_human.gaf.gz
revel:
Expand All @@ -182,7 +218,7 @@ species:
- id: hsapiens
scientificName: Homo sapiens
assemblies:
- ensemblVersion: '104_38'
- ensemblVersion: '110_38'
name: GRCh38
- ensemblVersion: '82_37'
name: GRCh37
Expand Down
6 changes: 3 additions & 3 deletions cellbase-lib/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<parent>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase</artifactId>
<version>5.8.2</version>
<version>6.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down Expand Up @@ -137,10 +137,10 @@
<groupId>com.github.samtools</groupId>
<artifactId>htsjdk</artifactId>
</dependency>
<dependency>
<!-- <dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
</dependency>
</dependency>-->
<dependency>
<groupId>io.jsonwebtoken</groupId>
<artifactId>jjwt-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ public class EtlCommons {
public static final String HPO_FILE = "hp.obo";
public static final String GO_FILE = "go-basic.obo";
public static final String DOID_FILE = "doid.obo";
public static final String MONDO_FILE = "mondo.obo";
public static final String PFM_DATA = "regulatory_pfm";

// Build specific data options
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, Species
boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException {
this(null, geneDirectoryPath.resolve("description.txt"),
geneDirectoryPath.resolve("xrefs.txt"),
geneDirectoryPath.resolve("hgnc_complete_set_2022-01-01.txt"),
geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"),
geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"),
geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"),
geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"),
geneDirectoryPath.resolve("idmapping_selected.tab.gz"),
geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@ public class OntologyBuilder extends CellBaseBuilder {
private Path hpoFile;
private Path goFile;
private Path doidFile;
private Path mondoFile;

/**
 * Resolves the HPO, GO, DOID and MONDO OBO file paths inside the given directory,
 * using the file names declared in {@link EtlCommons}.
 *
 * @param oboDirectoryPath directory against which the OBO file names are resolved
 * @param serializer       serializer handed to the parent builder
 */
public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) {
    super(serializer);

    this.hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE);
    this.goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE);
    this.doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE);
    this.mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE);
}

@Override
Expand All @@ -64,6 +66,13 @@ public void parse() throws Exception {
serializer.serialize(term);
}

bufferedReader = FileUtils.newBufferedReader(mondoFile);
terms = parser.parseOBO(bufferedReader, "Mondo Ontology");
for (OntologyTerm term : terms) {
term.setSource("MONDO");
serializer.serialize(term);
}

serializer.close();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ public List<DownloadFile> download() throws IOException, InterruptedException {
downloadFiles.addAll(downloadRefSeq(refseqFolder));
downloadFiles.add(downloadMane(geneFolder));
downloadFiles.add(downloadLrg(geneFolder));
downloadFiles.add(downloadHgnc(geneFolder));
downloadFiles.add(downloadCancerHotspot(geneFolder));
downloadFiles.add(downloadDrugData(geneFolder));
downloadFiles.addAll(downloadGeneUniprotXref(geneFolder));
downloadFiles.add(downloadGeneExpressionAtlas(geneFolder));
Expand Down Expand Up @@ -208,6 +210,30 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte
return null;
}

/**
 * Downloads the HGNC file configured under {@code download.hgnc} into the gene folder.
 * Before downloading, the configured version, a timestamp and the URL are passed to
 * {@code saveVersionData} together with the target {@code hgncVersion.json} path.
 * The local file name is the last {@code /}-separated segment of the configured URL.
 *
 * @param geneFolder folder that receives the downloaded file and the version JSON path
 * @return the result of {@code downloadFile}, or {@code null} when the species is not "Homo sapiens"
 * @throws IOException          declared by the helpers called here
 * @throws InterruptedException declared by the helpers called here
 */
private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException {
    // Only handled for human; every other species yields no download.
    if (!speciesConfiguration.getScientificName().equals("Homo sapiens")) {
        return null;
    }

    logger.info("Downloading HGNC ...");
    String hgncUrl = configuration.getDownload().getHgnc().getHost();
    saveVersionData(
            EtlCommons.GENE_DATA,
            "HGNC_GENE",
            configuration.getDownload().getHgnc().getVersion(),
            getTimeStamp(),
            Collections.singletonList(hgncUrl),
            geneFolder.resolve("hgncVersion.json"));

    // Keep String.split semantics for deriving the file name (trailing empty segments are dropped).
    String[] urlSegments = hgncUrl.split("/");
    String fileName = urlSegments[urlSegments.length - 1];
    return downloadFile(hgncUrl, geneFolder.resolve(fileName).toString());
}

/**
 * Downloads the Cancer Hotspot file configured under {@code download.cancerHotspot} into the gene folder.
 * Before downloading, the configured Cancer Hotspot version, a timestamp and the URL are passed to
 * {@code saveVersionData} together with the target {@code cancerHotspotVersion.json} path.
 * The local file name is the last {@code /}-separated segment of the configured URL.
 *
 * @param geneFolder folder that receives the downloaded file and the version JSON path
 * @return the result of {@code downloadFile}, or {@code null} when the species is not "Homo sapiens"
 * @throws IOException          declared by the helpers called here
 * @throws InterruptedException declared by the helpers called here
 */
private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException {
    if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
        logger.info("Downloading Cancer Hotspot ...");
        String url = configuration.getDownload().getCancerHotspot().getHost();
        // The version must be read from the cancerHotspot entry; reading it from the HGNC entry
        // would record HGNC's release date as the Cancer Hotspot version.
        saveVersionData(EtlCommons.GENE_DATA, "CANCER_HOTSPOT",
                configuration.getDownload().getCancerHotspot().getVersion(),
                getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("cancerHotspotVersion.json"));
        String[] array = url.split("/");
        return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString());
    }
    return null;
}

private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException {
if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
logger.info("Downloading go annotation...");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ public GenomeDownloadManager(String species, String assembly, Path targetDirecto
public List<DownloadFile> download() throws IOException, InterruptedException {
List<DownloadFile> downloadFiles = new ArrayList<>();
downloadFiles.addAll(downloadReferenceGenome());
// downloadFiles.addAll(downloadConservation());
// downloadFiles.addAll(downloadRepeats());
downloadFiles.addAll(downloadConservation());
downloadFiles.addAll(downloadRepeats());

// cytobands
runGenomeInfo();
// runGenomeInfo();
return downloadFiles;
}

Expand Down Expand Up @@ -115,16 +115,16 @@ public List<DownloadFile> downloadConservation() throws IOException, Interrupted
List<String> phastconsUrls = new ArrayList<>(chromosomes.length);
List<String> phyloPUrls = new ArrayList<>(chromosomes.length);
for (String chromosome : chromosomes) {
String phastConsUrl = url + "/phastCons100way/hg38.100way.phastCons/chr" + chromosome
+ ".phastCons100way.wigFix.gz";
String phastConsUrl = url + "/phastCons470way/hg38.470way.phastCons/chr" + chromosome
+ ".phastCons470way.wigFix.gz";
downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve("phastCons")
.resolve("chr" + chromosome + ".phastCons100way.wigFix.gz").toString()));
.resolve("chr" + chromosome + ".phastCons470way.wigFix.gz").toString()));
phastconsUrls.add(phastConsUrl);

String phyloPUrl = url + "/phyloP100way/hg38.100way.phyloP100way/chr" + chromosome
+ ".phyloP100way.wigFix.gz";
String phyloPUrl = url + "/phyloP470way/hg38.470way.phyloP/chr" + chromosome
+ ".phyloP470way.wigFix.gz";
downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve("phylop")
.resolve("chr" + chromosome + ".phyloP100way.wigFix.gz").toString()));
.resolve("chr" + chromosome + ".phyloP470way.wigFix.gz").toString()));
phyloPUrls.add(phyloPUrl);
}
String gerpUrl = configuration.getDownload().getGerp().getHost();
Expand Down
Loading

0 comments on commit f95bf41

Please sign in to comment.