From 3af49599d8c2d95c1c6aaa0f5aef0407184920ed Mon Sep 17 00:00:00 2001 From: Roy Francis Date: Tue, 29 Oct 2024 22:44:48 +0100 Subject: [PATCH] Replaced R chunks with meta shortcodes --- _variables.yml | 2 + topics/rnaseq/lab_rnaseq.qmd | 469 +++++++++++------------------------ 2 files changed, 152 insertions(+), 319 deletions(-) create mode 100644 _variables.yml diff --git a/_variables.yml b/_variables.yml new file mode 100644 index 00000000..88e0e798 --- /dev/null +++ b/_variables.yml @@ -0,0 +1,2 @@ +# output directory used on gh-pages branch. format: YYMM +output-dir: "2411" diff --git a/topics/rnaseq/lab_rnaseq.qmd b/topics/rnaseq/lab_rnaseq.qmd index 4efab7f6..59e60a42 100644 --- a/topics/rnaseq/lab_rnaseq.qmd +++ b/topics/rnaseq/lab_rnaseq.qmd @@ -5,16 +5,6 @@ author: "Roy Francis / Dag Ahrén" format: html --- -```{r} -#| include: false -library(yaml) -library(here) -id_project <- yaml::read_yaml(here("_quarto.yml"))$id_project -path_resources <- paste0(yaml::read_yaml(here("_quarto.yml"))$path_resources,"/rnaseq") -path_rnaseq <- paste0(yaml::read_yaml(here("_quarto.yml"))$path_workspace,"/rnaseq") -url_cluster <- yaml::read_yaml(here("_quarto.yml"))$url_cluster -``` - ```{r} #| include: false @@ -102,9 +92,8 @@ If you have issues opening GUI windows from UPPMAX through the terminal, it is r If you are not on ThinLinc, connect to UPPMAX first. {{< fa exclamation-circle >}} Remember to replace username. -```{sh} -#| eval: false -ssh -XY username@`r url_cluster` +```sh +ssh -XY username@{{< meta url_cluster >}} ``` Book a node. @@ -113,11 +102,8 @@ For the RNA-Seq part of the course, we will work on the Rackham cluster. A stand Book compute resources for RNA-Seq lab. -```{r} -#| echo: false -#| class.output: "sh" -#| comment: "" -cat(paste0("salloc -A ",id_project," -t 03:00:00 -p shared -n 4 --no-shell")) +```sh +salloc -A {{< meta id_project >}} -t 03:00:00 -p shared -n 4 --no-shell ``` Check allocation. {{< fa exclamation-circle >}} Remember to replace **username**. @@ -138,8 +124,7 @@ Setting up the directory structure is an important step as it helps to keep our Create a directory named `rnaseq`. -```{sh} -#| eval: false +```sh mkdir rnaseq ``` @@ -161,16 +146,12 @@ rnaseq/ +-- plots ``` -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" - -cat(paste0('cd rnaseq +```sh +cd rnaseq mkdir 1_raw 2_fastqc 3_mapping 4_qualimap 5_dge 6_multiqc reference scripts funannot plots cd reference mkdir mouse_chr19_hisat2 -cd ..')) +cd .. ``` The `1_raw` directory will hold the raw fastq files (soft-links). `2_fastqc` will hold FastQC outputs. `3_mapping` will hold the mapping output files. `4_qualimap` will hold the post alignment QC output. `5_dge` will hold the counts from featureCounts and all differential gene expression related files. `6_multiqc` will hold MultiQC outputs. `reference` directory will hold the reference genome, annotations and aligner indices. The `funannot` and `plots` directory are optional for bonus steps. @@ -179,37 +160,28 @@ The `1_raw` directory will hold the raw fastq files (soft-links). `2_fastqc` wil #### Create symbolic links -We have the raw fastq files in this remote directory: ``r paste0(path_resources,"/main/1_raw/")``. We are going to create symbolic links (soft-links) for these files from our `1_raw` directory to the remote directory. We do this because fastq files tend to be large files and simply copying them would use up a lot of storage space. Soft-linking files and folders allows us to work with those files as if they were actually there. +We have the raw fastq files in this remote directory: `{{< meta path_resources >}}/rnaseq/dardel/main/1_raw/`. We are going to create symbolic links (soft-links) for these files from our `1_raw` directory to the remote directory. We do this because fastq files tend to be large files and simply copying them would use up a lot of storage space. Soft-linking files and folders allows us to work with those files as if they were actually there. Change to `1_raw` directory. Use `pwd` to check if you are standing in the correct directory. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat("cd 1_raw\npwd") +```sh +cd 1_raw +pwd ``` -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/1_raw")) +``` +{{< meta path_workspace >}}/rnaseq/1_raw ``` Run below to create softlinks. Note that the command ends in a space followed by a period. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("ln -s ",path_resources,"/main/1_raw/*.gz .")) +```sh +ln -s {{< meta path_resources >}}/rnaseq/dardel/main/1_raw/*.gz . ``` Check if your files have linked correctly. You should be able to see as below. -```{sh} -#| eval: false -#| class.output: "sh" +```sh ls -l ``` @@ -236,23 +208,18 @@ After receiving raw reads from a high throughput sequencing centre it is essenti {{< fa clipboard-list >}} Change into the `2_fastqc` directory. Use `pwd` to check if you are standing in the correct directory. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat("cd ../2_fastqc\npwd") +```sh +cd ../2_fastqc +pwd ``` -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/2_fastqc")) +``` +{{< meta path_workspace >}}/rnaseq/2_fastqc ``` Load Uppmax modules `bioinfo-tools` and FastQC `FastQC/0.11.9`. -```{sh} -#| eval: false +```sh module load PDC/23.12 module load bioinfo-tools module load FastQC/0.11.9 @@ -262,37 +229,30 @@ Once the module is loaded, FastQC program is available through the command `fast {{< fa exclamation-circle >}} Don't run this. It's just a template. -```{sh} -#| eval: false +```sh fastqc -o . ../1_raw/filename.fq.gz ``` Based on the above command, we will write a bash loop script to process all fastq files in the directory. Writing multi-line commands through the terminal can be a pain. Therefore, we will run larger scripts from a bash script file. Move to your `scripts` directory and create a new file named `fastqc.sh`. -```{r} -#| echo: false -#| class.output: "sh" -#| comment: "" -cat("cd ../scripts\npwd") +```sh +cd ../scripts +pwd ``` -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/scripts")) +``` +{{< meta path_workspace >}}/rnaseq/scripts ``` The command below creates a new file in the current directory. -```{sh} -#| eval: false +```sh touch fastqc.sh ``` Use a text editor (`nano`,`Emacs` etc.) to edit `fastqc.sh`. -```{sh} -#| eval: false +```sh nano fastqc.sh ``` @@ -318,21 +278,16 @@ This script loops through all files ending in .gz. In each iteration of the loop Change to the `2_fastqc` directory. Use `pwd` to check if you are standing in the correct directory. Run the script file `fastqc.sh` -```{r} -#| echo: false -#| class.output: "sh" -#| comment: "" -cat("cd ../2_fastqc\npwd") +```sh +cd ../2_fastqc +pwd ``` -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/2_fastqc")) +``` +{{< meta path_workspace >}}/rnaseq/2_fastqc ``` -```{sh} -#| eval: false +```sh bash ../scripts/fastqc.sh ``` @@ -342,20 +297,14 @@ After the fastqc run, there should be a `.zip` file and a `.html` file for every {{< fa exclamation-circle >}} Remember to replace **username**. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("scp username@", url_cluster, ":",path_rnaseq,"/2_fastqc/SRR3222409-19_1_fastqc.html .")) +```sh +scp username@{{< meta url_cluster >}}:{{< meta path_workspace >}}/rnaseq/2_fastqc/SRR3222409-19_1_fastqc.html . ``` or download the whole directory. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("scp -r username@", url_cluster, ":",path_rnaseq,"/2_fastqc .")) +```sh +scp -r username@{{< meta url_cluster >}}:{{< meta path_workspace >}}/rnaseq/2_fastqc . ``` ::: {.callout-note} @@ -385,32 +334,27 @@ It is best if the reference genome (`.fasta`) and annotation (`.gtf`) files come You should be standing here to run this: -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/reference")) +```sh +{{< meta path_workspace >}}/rnaseq/reference ``` You are most likely to use the [latest version](https://www.ensembl.org/index.html) of ensembl release genome and annotations when starting a new analysis. For this exercise, we will choose ensembl version 99. -```{sh} -#| eval: false +```sh wget ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.chromosome.19.fa.gz wget ftp://ftp.ensembl.org/pub/release-99/gtf/mus_musculus/Mus_musculus.GRCm38.99.gtf.gz ``` Decompress the files for use. -```{sh} -#| eval: false +```sh gunzip Mus_musculus.GRCm38.dna.chromosome.19.fa.gz gunzip Mus_musculus.GRCm38.99.gtf.gz ``` From the full gtf file, we will also extract chr 19 alone to create a new gtf file for use later. -```{sh} -#| eval: false +```sh cat Mus_musculus.GRCm38.99.gtf | grep -E "^#|^19" > Mus_musculus.GRCm38.99-19.gtf ``` @@ -418,8 +362,7 @@ cat Mus_musculus.GRCm38.99.gtf | grep -E "^#|^19" > Mus_musculus.GRCm38.99-19.gt Check what you have in your directory. -```{sh} -#| eval: false +```sh ls -l ``` @@ -434,8 +377,7 @@ drwxrwsr-x 2 user gXXXXXXX 4.0K Jan 22 21:59 mouse_chr19_hisat2 Move into the `reference` directory if not already there. Load module HISAT2. Remember to load `bioinfo-tools` if you haven't done so already. -```{sh} -#| eval: false +```sh module load PDC/23.12 module load bioinfo-tools module load HISAT2/2.2.1 @@ -445,8 +387,7 @@ module load HISAT2/2.2.1 Create a new bash script in your `scripts` directory named `hisat2_index.sh` and add the following lines: -```{sh} -#| eval: false +```sh #!/bin/bash # load module @@ -464,15 +405,13 @@ The above script builds a HISAT2 index using the command `hisat2-build`. It shou Use `pwd` to check if you are standing in the correct directory. Then, run the script from the `reference` directory. -```{sh} -#| eval: false +```sh bash ../scripts/hisat2_index.sh ``` Once the indexing is complete, move into the `mouse_chr19_hisat2` directory and make sure you have all the files. -```{sh} -#| eval: false +```sh ls -l mouse_chr19_hisat2/ ``` @@ -495,16 +434,13 @@ Now that we have the index ready, we are ready to map reads. Move to the directo You should be standing here to run this: -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/3_mapping")) +``` +{{< meta path_workspace >}}/rnaseq/3_mapping ``` We will create softlinks to the fastq files from here to make things easier. -```{sh} -#| eval: false +```sh cd 3_mapping ln -s ../1_raw/* . ``` @@ -519,8 +455,7 @@ These are the parameters that we want to specify for the mapping run: HISAT2 aligner arguments can be obtained by running `hisat2 --help`. We should end with a script that looks like below to run one of the samples. -```{sh} -#| eval: false +```sh hisat2 \ -p 1 \ -x ../reference/mouse_chr19_hisat2/mouse_chr19_hisat2 \ @@ -539,8 +474,7 @@ But, we will not run it as above. We will make some more changes to it. We want {{< fa clipboard-list >}} Now create a new bash script file named `hisat2_align.sh` in your `scripts` directory and add the script below to it. -```{sh} -#| eval: false +```sh #!/bin/bash module load PDC/23.12 @@ -567,8 +501,7 @@ Lastly, the SAM output is piped (`|`) to the tool samtools for sorting and writt Now we can run the bash script like below while standing in the `3_mapping` directory. -```{sh} -#| eval: false +```sh bash ../scripts/hisat2_align.sh sample_1.fq.gz sample_2.fq.gz ``` @@ -580,9 +513,7 @@ bash ../scripts/hisat2_align.sh sample_1.fq.gz sample_2.fq.gz Try to create a new bash loop script (`hisat2_align_batch.sh`) to iterate over all fastq files in the directory and run the mapping using the `hisat2_align.sh` script. Note that there is a bit of a tricky issue here. You need to use two fastq files (`_1` and `_2`) per run rather than one file. There are many ways to do this and here is one. -```{sh} -#| eval: false - +```sh ## find only files for read 1 and extract the sample name lines=$(find *_1.fq.gz | sed "s/_1.fq.gz//") @@ -596,8 +527,7 @@ done Run the `hisat2_align_batch.sh` script in the `3_mapping` directory. -```{sh} -#| eval: false +```sh bash ../scripts/hisat2_align_batch.sh ``` @@ -605,8 +535,7 @@ bash ../scripts/hisat2_align_batch.sh At the end of the mapping jobs, you should have the following list of output files for every sample: -```{sh} -#| eval: false +```sh ls -l ``` @@ -617,8 +546,7 @@ ls -l The `.bam` file contains the alignment of all reads to the reference genome in binary format. BAM files are not human readable directly. To view a BAM file in text format, you can use `samtools view` functionality. -```{sh} -#| eval: false +```sh module load PDC/23.12 module load bioinfo-tools module load samtools/1.20 @@ -635,8 +563,7 @@ SRR3222409.13741570 163 19 3085066 60 15M2I84M = 3085166 201 ATAGTACCTGGCAACAAAA {{< fa clipboard-list >}} The summary file gives a summary of the mapping run. This file is used by MultiQC later to collect mapping statistics. Inspect one of the mapping log files to identify the number of uniquely mapped reads and multi-mapped reads. -```{sh} -#| eval: false +```sh cat SRR3222409-19.summary ``` @@ -662,8 +589,7 @@ Next, we need to index these BAM files. Indexing creates `.bam.bai` files which {{< fa clipboard-list >}} Index all BAM files. Write a for-loop to index all BAM files using the command `samtools index file.bam`. -```{sh} -#| eval: false +```sh module load PDC/23.12 module load bioinfo-tools module load samtools/1.19 @@ -677,8 +603,7 @@ for i in *.bam Now, we should have `.bai` index files for all BAM files. -```{sh} -#| eval: false +```sh ls -l ``` @@ -687,13 +612,10 @@ ls -l -rw-rw-r-- 1 user gXXXXXXX 43K Sep 19 12:32 SRR3222409-19.bam.bai ``` -{{< fa exclamation-circle >}} If you are running short of time or unable to run the mapping, you can copy over results for all samples that have been prepared for you before class. They are available at this location: ``r paste0(path_resources,"/main/3_mapping/")``. +{{< fa exclamation-circle >}} If you are running short of time or unable to run the mapping, you can copy over results for all samples that have been prepared for you before class. They are available at this location: `{{< meta path_resources >}}/rnaseq/dardel/main/3_mapping/`. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp -r ",path_resources,"/main/3_mapping/* ",path_rnaseq,"/rnaseq/3_mapping/")) +```sh +cp -r {{< meta path_resources >}}/rnaseq/dardel/main/3_mapping/* {{< meta path_workspace >}}/rnaseq/3_mapping/ ```
@@ -712,8 +634,7 @@ There are other tools with similar functionality such as [RNASeqQC](http://quali Add the following script to it. Note that we are using the smaller GTF file with chr19 only. -```{sh} -#| eval: false +```sh #!/bin/bash # load modules @@ -739,8 +660,7 @@ The line `prefix=$( basename "$1" .bam)` is used to remove directory path and `. {{< fa clipboard-list >}} Create a new bash loop script named `qualimap_batch.sh` with a bash loop to run the qualimap script over all BAM files. The loop should look like below. -```{sh} -#| eval: false +```sh for i in ../3_mapping/*.bam do echo "Running QualiMap on $i ..." @@ -750,8 +670,7 @@ done Run the loop script `qualimap_batch.sh` in the directory `4_qualimap`. -```{sh} -#| eval: false +```sh bash ../scripts/qualimap_batch.sh ``` @@ -764,8 +683,7 @@ drwxrwsr-x 5 user gXXXXXXX 4.0K Jan 22 22:53 SRR3222409-19 Inside every directory, you should see: -```{sh} -#| eval: false +```sh ls -l ``` @@ -781,22 +699,16 @@ drwxrwsr-x 2 user gXXXXXXX 4.0K Jan 22 22:53 raw_data_qualimapReport {{< fa exclamation-circle >}} When downloading the HTML files, note that you MUST also download the dependency files (ie; css folder and images_qualimapReport folder), otherwise the HTML file may not render correctly. Remember to replace **username** (twice). -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("scp -r username@", url_cluster,":",path_rnaseq,"/4_qualimap .")) +```sh +scp -r username@{{< meta url_cluster >}}/rnaseq:{{< meta path_workspace >}}/rnaseq/4_qualimap . ``` {{< fa comments >}} Check the QualiMap report for one sample and discuss if the sample is of good quality. You only need to do this for one file now. We will do a comparison with all samples when using the MultiQC tool. -{{< fa exclamation-circle >}} If you are running out of time or were unable to run QualiMap, you can also copy pre-run QualiMap output from this location: ``r paste0(path_resources,"/main/4_qualimap/")``. +{{< fa exclamation-circle >}} If you are running out of time or were unable to run QualiMap, you can also copy pre-run QualiMap output from this location: `{{< meta path_resources >}}/rnaseq/dardel/main/4_qualimap/`. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp -r ",path_resources,"/main/4_qualimap/* ",path_rnaseq,"/4_qualimap/")) +```sh +cp -r {{< meta path_resources >}}/rnaseq/dardel/main/4_qualimap/* {{< meta path_workspace >}}/rnaseq/4_qualimap/")) ```
@@ -815,8 +727,7 @@ We could run featureCounts on each BAM file, produce a text output for each samp Below is the script that we will use: -```{sh} -#| eval: false +```sh #!/bin/bash # load modules @@ -843,21 +754,17 @@ Run the featurecounts bash script in the directory `5_dge`. Use `pwd` to check You should be standing here to run this: -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/5_dge")) +``` +{{< meta path_workspace >}}/rnaseq/5_dge ``` -```{sh} -#| eval: false +```sh bash ../scripts/featurecounts.sh ``` You should have two output files: -```{sh} -#| eval: false +```sh ls -l ``` @@ -872,15 +779,12 @@ ls -l ### Important -For downstream steps, we will NOT use this `counts.txt` file. Instead we will use **counts_full.txt** from the back-up folder. This contains counts across all chromosomes. This is located here: ``r paste0(path_resources,"/main/5_dge/")``. Copy this file to your `5_dge` directory. +For downstream steps, we will NOT use this `counts.txt` file. Instead we will use **counts_full.txt** from the back-up folder. This contains counts across all chromosomes. This is located here: `{{< meta path_resources >}}/rnaseq/dardel/main/5_dge/`. Copy this file to your `5_dge` directory. {{< fa exclamation-circle >}} Remember to replace username. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp ",path_resources,"/main/5_dge/counts_full.txt ",path_rnaseq,"/5_dge/")) +```sh +cp {{< meta path_resources >}}/rnaseq/dardel/main/5_dge/counts_full.txt {{< meta path_workspace >}}/rnaseq/5_dge/")) ``` ::: @@ -895,16 +799,13 @@ We will use the tool **MultiQC** to crawl through the output, log files etc from Move to the `6_multiqc` directory. You should be standing here to run this: -```{r} -#| echo: false -#| comment: "" -cat(paste0(path_rnaseq,"/6_multiqc")) +``` +{{< meta path_workspace >}}/rnaseq/6_multiqc ``` And run this in the terminal. -```{sh} -#| eval: false +```sh module load PDC/23.12 module load bioinfo-tools module load MultiQC/1.12 @@ -929,8 +830,7 @@ multiqc --interactive ../ The output should look like below: -```{sh} -#| eval: false +```sh ls -l ``` @@ -943,11 +843,8 @@ drwxrwsr-x 2 user gXXXXXXX 4.0K Sep 6 22:33 multiqc_data {{< fa exclamation-circle >}} Run this step in a **LOCAL** terminal and **NOT** on Uppmax. Remember to replace username (twice). -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("scp username@", url_cluster, ":",path_rnaseq,"/6_multiqc/multiqc_report.html .")) +```sh +scp username@{{< meta url_cluster >}}:{{< meta path_workspace >}}/rnaseq/6_multiqc/multiqc_report.html . ```
@@ -960,27 +857,22 @@ The easiest way to perform differential expression is to use one of the statisti Move to the `5_dge` directory and load R modules for use. -```{sh} -#| eval: false +```sh module load R/4.4.0 ``` -Use `pwd` to check if you are standing in the correct directory. Copy the following file to the `5_dge` directory: ``r paste0(path_resources,"/main/5_dge/dge.R")`` +Use `pwd` to check if you are standing in the correct directory. Copy the following file to the `5_dge` directory: `{{< meta path_resources >}}/rnaseq/dardel/main/5_dge/dge.R` -Make sure you have the `counts_full.txt`. If not, you can copy this file too: ``r paste0(path_resources,"/main/5_dge/counts_full.txt")`` +Make sure you have the `counts_full.txt`. If not, you can copy this file too: `{{< meta path_resources >}}/rnaseq/dardel/main/5_dge/counts_full.txt` -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp ",path_resources,"/main/5_dge/dge.R .\n")) -cat(paste0("cp ",path_resources,"/main/5_dge/counts_full.txt .\n")) +```sh +cp {{< meta path_resources >}}/rnaseq/dardel/main/5_dge/dge.R . +cp {{< meta path_resources >}}/rnaseq/dardel/main/5_dge/counts_full.txt . ``` Now, run the R script from the shell in `5_dge` directory. -```{sh} -#| eval: false +```sh Rscript dge.R ``` @@ -988,8 +880,7 @@ If you are curious what's inside dge.R, you are welcome to explore it using a te This should have produced the following output files: -```{sh} -#| eval: false +```sh ls -l ``` @@ -1008,14 +899,11 @@ Essentially, we have two outputs: **dge_results_full** and **counts_vst_full**. {{< fa lightbulb >}} If you do not have the results or were unable to run the DGE step, you can copy these two here which will be required for functional annotation (optional). -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp ",path_resources,"/main/5_dge/dge_results_full.txt .\n")) -cat(paste0("cp ",path_resources,"/main/5_dge/dge_results_full.Rds .\n")) -cat(paste0("cp ",path_resources,"/main/5_dge/counts_vst_full.txt .\n")) -cat(paste0("cp ",path_resources,"/main/5_dge/counts_vst_full.Rds .\n")) +```sh +cp {{< meta path_resources >}}/rnaseq/dardel/main/5_dge/dge_results_full.txt . +cp {{< meta path_resources >}}/rnaseq/dardel/main/5_dge/dge_results_full.Rds . +cp {{< meta path_resources >}}/rnaseq/dardel/main/5_dge/counts_vst_full.txt . +cp {{< meta path_resources >}}/rnaseq/dardel/main/5_dge/counts_vst_full.Rds . ``` ## Bonus exercises @@ -1041,51 +929,41 @@ In this part, we will use the same data as in the main workflow. The starting po Change to the `funannot` directory in your `rnaseq` directory. -```{sh} -#| eval: false +``` cd funannot ``` -Copy this file ``r paste0(path_resources,"/bonus/funannot/annotate_de_results.R")`` to your `rnaseq/funannot` directory. +Copy this file `{{< meta path_resources >}}/rnaseq/dardel/bonus/funannot/annotate_de_results.R` to your `rnaseq/funannot` directory. {{< fa exclamation-circle >}} Remember to replace username. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp ",path_resources,"/bonus/funannot/annotate_de_results.R ",path_rnaseq,"/rnaseq/funannot/")) +```sh +cp {{< meta path_resources >}}/rnaseq/dardel/bonus/funannot/annotate_de_results.R {{< meta path_workspace >}}/rnaseq/funannot/ ``` -We need some annotation information, so we will copy this file ``r paste0(path_resources,"/main/reference/mm-biomart99-genes.txt.gz")`` to your `rnaseq/reference` directory. +We need some annotation information, so we will copy this file `{{< meta path_resources >}}/rnaseq/dardel/main/reference/mm-biomart99-genes.txt.gz` to your `rnaseq/reference` directory. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp ",path_resources,"/main/reference/mm-biomart99-genes.txt.gz ",path_rnaseq,"/rnaseq/reference/")) +```sh +cp {{< meta path_resources >}}/rnaseq/dardel/main/reference/mm-biomart99-genes.txt.gz {{< meta path_workspace >}}/rnaseq/reference/ ``` Then uncompress the file. `gunzip ../reference/mm-biomart99-genes.txt.gz` Load R module and R packages -```{sh} -#| eval: false +```sh module load R/4.4.0 ``` Run the functional annotation script from the linux terminal. -```{sh} -#| eval: false +```sh Rscript annotate_de_results.R ``` It will fetch **dge_results_full.Rds** from `5_dge/` and annotation files from `reference/`. When completed, you should find a directory named **funannot_results**. -```{sh} -#| eval: false +```sh ls -l ``` @@ -1102,8 +980,7 @@ The results are saved as tables in the directory **funannot_results**. There are {{< fa clipboard-list >}} Take a quick look at some of these files. -```{sh} -#| eval: false +```sh head go_up.txt ``` @@ -1133,8 +1010,7 @@ You can view the tables and try to find GO terms and pathways relevant to the ex {{< fa clipboard-list >}} Try to use `grep` to find a match using a keyword, say **phosphorylation**. -```{sh} -#| eval: false +```sh cat go_down.txt | grep "phosphorylation" ``` @@ -1144,21 +1020,17 @@ cat go_down.txt | grep "phosphorylation" Creating high quality plots of RNA-seq analysis are most easily done using R. Depending on your proficiency in reading R code and using R, you can in this section either just call scripts from the command lines with a set of arguments or you can open the R script in a text editor, and run the code step by step from an interactive R session. -Copy the R script files from the following directory: ``r paste0(path_resources,"/bonus/plots/")`` to your `plots` directory. +Copy the R script files from the following directory: `{{< meta path_resources >}}/rnaseq/dardel/bonus/plots/` to your `plots` directory. {{< fa exclamation-circle >}} Remember to replace username. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("cp ",path_resources,"/bonus/plots/*.R ",path_rnaseq,"/plots/")) +```sh +cp {{< meta path_resources >}}/rnaseq/dardel/bonus/plots/*.R {{< meta path_workspace >}}/rnaseq/plots/ ``` You should have the following files: -```{sh} -#| eval: false +```sh ls -l ``` @@ -1172,8 +1044,7 @@ ls -l It is important that you load module R and R_packages. -```{sh} -#| eval: false +```sh module load R/4.4.0 ``` @@ -1183,18 +1054,13 @@ A popular way to visualise general patterns of gene expression in your data is t Move to the `plots/` directory. Then run the `pca.R` script like below. -```{sh} -#| eval: false +```sh Rscript pca.R ``` This generates a file named **pca.png** in the `plots` folder. To view it, use `eog pca.png &` or copy it to your local disk. -```{r} -#| echo: false -#| out.width: 500px -knitr::include_graphics("assets/pca.png") -``` +![](assets/pca.png){width="500px"} {{< fa comments >}} Do samples cluster as expected? Are there any odd or mislabelled samples? Based on these results, would you expect to find a large number of significant DE genes? @@ -1204,18 +1070,13 @@ An MA-plot plots the mean expression and estimated log-fold-change for all genes Run the `ma.R` script in the `plots` directory. -```{sh} -#| eval: false +```sh Rscript ma.R ``` This generates a file named **ma.png** in the `plots` folder. To view it, use `eog ma.png &` or copy it to your local disk. -```{r} -#| echo: false -#| out.width: 500px -knitr::include_graphics("assets/ma.png") -``` +![](assets/ma.png){width="500px"} {{< fa comments >}} What do you think the blue dots represent? @@ -1225,18 +1086,13 @@ A related type of figure will instead plot fold change (on log2 scale) on the x- Run the script named `volcano.R` in the `plots` directory. -```{sh} -#| eval: false +```sh Rscript volcano.R ``` This generates a file named **volcano.png** in the `plots` folder. To view it, use `eog volcano.png &` or copy it to your local disk. -```{r} -#| echo: false -#| out.width: 500px -knitr::include_graphics("assets/volcano.png") -``` +![](assets/volcano.png"{width="500px"} {{< fa comments >}} Anything noteworthy about the patterns in the plot? Is there a general trend in the direction of change in gene expression as a consequence of the experiment? @@ -1246,18 +1102,13 @@ Another popular plots for genome-wide expression patterns is heatmap for a set o Run the script named `heatmap.R` in the `plots` directory. -```{sh} -#| eval: false +```sh Rscript heatmap.R ``` This generates a file named **heatmap.png** in the `plots` folder. To view it, use `eog heatmap.png &` or copy it to your local disk. -```{r} -#| echo: false -#| out.width: 500px -knitr::include_graphics("assets/heatmap.png") -``` +![](assets/heatmap.png){width="500px"} {{< fa clipboard-list >}} Compare this plot to a similar plot in the paper behind the data. @@ -1279,14 +1130,11 @@ Copy two BAM files (one from each experimental group, for example; SRR3222409-19 {{< fa exclamation-circle >}} Remember to replace **username**. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0("scp username@", url_cluster, ":",path_rnaseq,"/rnaseq/3_mapping/SRR3222409-19.bam ./\n")) -cat(paste0("scp username@", url_cluster, ":",path_rnaseq,"/rnaseq/3_mapping/SRR3222409-19.bam.bai ./\n")) -cat(paste0("scp username@", url_cluster, ":",path_rnaseq,"/rnaseq/3_mapping/SRR3222412-19.bam ./\n")) -cat(paste0("scp username@", url_cluster, ":",path_rnaseq,"/rnaseq/3_mapping/SRR3222412-19.bam.bai ./\n")) +```sh +scp username@{{< meta url_cluster >}}:{{< meta path_workspace >}}/rnaseq/3_mapping/SRR3222409-19.bam . +scp username@{{< meta url_cluster >}}:{{< meta path_workspace >}}/rnaseq/3_mapping/SRR3222409-19.bam.bai . +scp username@{{< meta url_cluster >}}:{{< meta path_workspace >}}/rnaseq/3_mapping/SRR3222412-19.bam . +scp username@{{< meta url_cluster >}}:{{< meta path_workspace >}}/rnaseq/3_mapping/SRR3222412-19.bam.bai . ``` Alternatively, you can use an SFTP browser like [Filezilla](https://filezilla-project.org/) or [Cyberduck](https://cyberduck.io/) for a GUI interface. Windows users can also use the MobaXterm SFTP file browser to drag and drop. @@ -1300,8 +1148,8 @@ For Linux and Mac users, Log in to Uppmax in a way so that the generated graphic Login in to Uppmax with X-forwarding enabled: -```bash -ssh -Y username@`r url_cluster` +```sh +ssh -Y username@{{< meta url_cluster >}} ssh -Y computenode ``` @@ -1309,8 +1157,7 @@ An alternative method is to login through [Rackham-GUI](https://rackham-gui.uppm Load necessary modules and start IGV -```{sh} -#| eval: false +```sh module load PDC/23.12 module load bioinfo-tools module load IGV/2.16.0 @@ -1370,16 +1217,11 @@ Have a look at few of the interesting genes on Chr19 using the `external_gene_na To see some genes with large number of reads, see **Scd1** or **Scd2**. -![](assets/igv-tm7sf2.png) -IGV view of gene **Tm7sf2** across 6 samples. - -![](assets/igv-ankrd2.png) +![IGV view of gene **Tm7sf2** across 6 samples.](assets/igv-tm7sf2.png) -IGV view of gene **Ankrd2** across 6 samples. +![IGV view of gene **Ankrd2** across 6 samples.](assets/igv-ankrd2.png) -![](assets/igv-scd1.png) - -IGV view of gene **Scd1** across 6 samples. +![IGV view of gene **Scd1** across 6 samples.](assets/igv-scd1.png) For more detailed information on the splice reads you can instead of just looking at the splice panel right click on the read panel and select **Sashimi plots**. This will open a new window showing in an easy readable fashion how reads are spliced in mapping and you will also be able to see that there are differences in between what locations reads are spliced. This hence gives some indication on the isoform usage of the gene. @@ -1399,8 +1241,7 @@ The standard workflow on Uppmax is to login to the login node and then submit ta This is how our standard bash script for mapping looks like: -```{sh} -#| eval: false +```sh #!/bin/bash # load modules @@ -1422,12 +1263,9 @@ hisat2 \ We add `SBATCH` commands to the above script. The new script looks like this: -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0('#!/bin/bash -#SBATCH -A ',id_project,' +```sh +#!/bin/bash +#SBATCH -A {{< meta id_project >}} #SBATCH -p shared #SBATCH -n 8 #SBATCH -t 3:00:00 @@ -1447,8 +1285,7 @@ hisat2 \\ -x ../reference/mouse_chr19_hisat2/mouse_chr19_hisat2 \\ --summary-file "${prefix}-summary.txt" \\ -1 $1 \\ - -2 $2 | samtools sort -O BAM > "${prefix}.bam"' -)) + -2 $2 | samtools sort -O BAM > "${prefix}.bam ``` The `SBATCH` commands in the above script is specifying the account name to use resources from, the required number of cores, the time required for the job and a job name. @@ -1457,8 +1294,7 @@ The `SBATCH` commands in the above script is specifying the account name to use You can check your jobs in the queue by running the following command. -```{sh,eval=FALSE} -#| eval: false +```sh jobinfo -u user ``` @@ -1477,8 +1313,7 @@ Running jobs: If the job is pending, then you will see `PD` in the `ST` column. If your job is running, you should see `R`. Once your job starts running, you will see a file named `slurm-XXXX.out` in the directory in which you submitted the job. This is the **standard-out** from that job. ie; everything that you would normally see printed to your screen when running locally, is printed to this file when running as a job. Once the job is over, one would inspect the slurm output file. -```{sh} -#| eval: false +```sh head slurm-XXXX.out tail slurm-XXXX.out cat slurm-XXXX.out @@ -1521,13 +1356,10 @@ star_index = "/sw/data/igenomes/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/v And lastly we have a **nextflow.sh** which will be our bash script for launching the job. -```{r} -#| echo: false -#| comment: "" -#| class.output: "sh" -cat(paste0('#!/bin/bash +```sh +#!/bin/bash -#SBATCH -A ',id_project,' +#SBATCH -A {{< meta id_project >}} #SBATCH -p shared #SBATCH -n 4 #SBATCH -t 4:00:00 @@ -1539,8 +1371,7 @@ module load nextflow/24.04.2 module load nf-core/2.6 NXF_HOME=. -nextflow run nf-core/rnaseq -r 3.7 -c params.config -profile uppmax --project ',id_project,' -resume' -)) +nextflow run nf-core/rnaseq -r 3.7 -c params.config -profile uppmax --project {{< meta id_project >}} -resume' ``` Notice that we use `-profile uppmax` since this is run on Uppmax. The job is then submitted by simply running `sbatch nextflow.sh`.