Merge branch 'main' into czhu-version-bump

uclahs-cds · Apr 28, 2022 · d1f6fec · d1f6fec
2 parents 39b7c35 + 8598165
commit d1f6fec
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,10 +21,17 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 - Fixed the issue that in `splitFasta`, variant sources are not grouped as they are specified by `--group-source` #439
 
+
 ### Added
 
 - Resource usage, including memory, CPU and time, is now printed to stdout at the end of all command line programs.
 
+
+### Fixed
+
+- Fixed the issue that `--additional-split` was not recognized properly in `splitFasta`. #443
+
+
 ---
 
 ## [0.4.0] - 2022-03-17

diff --git a/docs/split-fasta.md b/docs/split-fasta.md
@@ -55,6 +55,25 @@ moPepGen splitFasta \
 
 This example outputs three split FASTA files, `split_Coding.fasta`, `split_RNAEditing.fasta`, and `split_Remaining.fasta`.
 
+### Additional split
+
+Additional split allows you to split out the records with the specified source groups that would otherwise be placed into the remaining FASTA. See the example below.
+
+```bash
+moPepGen splitFasta \
+  --gvf \
+    path/to/gSNP.gvf \
+    path/to/gINDEL.gvf \
+    path/to/reditools.gvf \
+  --variant-peptides path/to/variant.fasta \
+  --index-dir path/to/index \
+  --max-source-groups 1 \
+  --additional-split gSNP-gINDEL gSNP-RNAEditing \
+  --output-prefix path/to/split
+```
+
+As a result, `split_gSNP.fasta` and `split_gINDEL.fasta` will be written. `split_gSNP-gINDEL.fasta` and `split_gSNP-RNAEditing.fasta` will also be written, although their number of variant sources (2) is larger than the value specified through `--max-source-groups`.
+
 ## Arguments
 
 {% with actions=get_arg_data(command) %}

diff --git a/moPepGen/cli/split_fasta.py b/moPepGen/cli/split_fasta.py
@@ -82,9 +82,11 @@ def add_subparser_split_fasta(subparser:argparse._SubParsersAction):
         type=str,
         help='For peptides that were not already split into FASTAs up to'
         'max_source_groups, those involving the following source will be split'
-        'into additional FASTAs with decreasing priority',
+        'into additional FASTAs with decreasing priority. E.g., '
+        "'gSNP-Noncoding', 'gSNP-Noncoding gSNP-gINDEL'",
         metavar='<value>',
-        default=None
+        default=None,
+        nargs="*"
     )
 
     common.add_args_reference(p, genome=False, proteome=True)
@@ -141,7 +143,7 @@ def split_fasta(args:argparse.Namespace) -> None:
 
     additional_split = args.additional_split or []
     sep = SPLIT_DATABASE_KEY_SEPARATER
-    additional_split = [{x.split(sep)} for x in additional_split]
+    additional_split = [set(x.split(sep)) for x in additional_split]
     splitter.split(args.max_source_groups, additional_split, anno)
 
     if not args.quiet:

diff --git a/test/integration/test_split_fasta.py b/test/integration/test_split_fasta.py
@@ -110,3 +110,25 @@ def test_split_fasta_case3(self):
             'test_circRNA.fasta', 'test_Remaining.fasta', 'test_circRNA.fasta',
             'test_Noncoding.fasta'}
         self.assertEqual(files, expected)
+
+    def test_split_fasta_case4(self):
+        """ test splitFasta case 4 with additional split """
+        args = self.create_base_args()
+        args.gvf = [
+            self.data_dir/'vep/vep_gSNP.gvf',
+            self.data_dir/'vep/vep_gINDEL.gvf',
+            self.data_dir/'reditools/reditools.gvf',
+            self.data_dir/'fusion/star_fusion.gvf',
+            self.data_dir/'circRNA/circ_rna.gvf'
+        ]
+        args.variant_peptides = self.data_dir/'peptides/variant.fasta'
+        args.noncoding_peptides = self.data_dir/'peptides/noncoding.fasta'
+        args.annotation_gtf = self.data_dir/'annotation.gtf'
+        args.proteome_fasta = self.data_dir/'translate.fasta'
+        args.additional_split = ['Noncoding-gSNP']
+        cli.split_fasta(args)
+        files = {str(file.name) for file in self.work_dir.glob('*')}
+        expected = {'test_gINDEL.fasta','test_gSNP.fasta',
+            'test_RNAEditingSite.fasta', 'test_circRNA.fasta',
+            'test_Remaining.fasta', 'test_circRNA.fasta', 'test_Noncoding.fasta'}
+        self.assertEqual(files, expected)