Merge pull request #1 from hgb-bin-proteomics/develop

v1.0.0 release
hgb-bin-proteomics · Jan 21, 2024 · ca5b530 · ca5b530
2 parents 8626788 + 1326fbe
commit ca5b530
Show file tree

Hide file tree

Showing 8 changed files with 433 additions and 1 deletion.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -0,0 +1,47 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# Reference workflow provided by (c) GitHub
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: msannika_merge
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Copy scripts and data to "/tests"
+      run: |
+        cp msannika_merge.py tests
+        cp data/DSSO_CSMs.xlsx .
+        cp data/ncDSSO_CSMs.xlsx .
+        wget https://raw.githubusercontent.com/hgb-bin-proteomics/MSAnnika_FDR/master/msannika_fdr.py
+        cp msannika_fdr.py tests
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest tests/tests.py
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,28 @@
+cff-version: 1.2.0
+preferred-citation:
+  type: article
+  authors:
+  - family-names: "Birklbauer"
+    given-names: "Micha J."
+    orcid: "https://orcid.org/0009-0005-1051-179X"
+  - family-names: "Matzinger"
+    given-names: "Manuel"
+    orcid: "https://orcid.org/0000-0002-9765-7951"
+  - family-names: "Müller"
+    given-names: "Fränze"
+    orcid: "https://orcid.org/0000-0003-3764-3547"
+  - family-names: "Mechtler"
+    given-names: "Karl"
+    orcid: "https://orcid.org/0000-0002-3392-9946"
+  - family-names: "Dorfer"
+    given-names: "Viktoria"
+    orcid: "https://orcid.org/0000-0002-5332-5701"
+  doi: "10.1021/acs.jproteome.3c00325"
+  journal: "Journal of Proteome Research"
+  month: 9
+  start: 3009
+  end: 3021
+  title: "MS Annika 2.0 Identifies Cross-Linked Peptides in MS2–MS3-Based Workflows at High Sensitivity and Specificity"
+  issue: 9
+  volume: 22
+  year: 2023
diff --git a/README.md b/README.md
@@ -1 +1,167 @@
-# MSAnnika_Combine_Results
+![workflow_state](https://github.com/hgb-bin-proteomics/MSAnnika_Combine_Results/workflows/msannika_merge/badge.svg)
+
+# MS Annika Combine Results
+
+A script to merge and optionally validate several [MS Annika](https://github.com/hgb-bin-proteomics/MSAnnika)
+search results. The main use case would be for merging results from different MS
+Annika runs, e.g. combining results from a cleavable and non-cleavable MS Annika
+search or combining results from different doublet distances.
+
+## Usage
+
+- Install python 3.7+: [https://www.python.org/downloads/](https://www.python.org/downloads/)
+- Install requirements: `pip install -r requirements.txt`
+- Export MS Annika CSM results from Proteome Discoverer to Microsoft Excel format.
+  - **Important:** CSMs should not be filtered! Export all (unvalidated) CSMs including decoy hits!
+- Run `python msannika_merge.py filename1.xlsx filename2.xlsx -fdr 0.01` (see below for more examples).
+- The script may take a few minutes, depending on the number of CSMs to process.
+- Done!
+
+## Examples
+
+`msannika_merge.py` takes one positional and two optional arguments. The first
+argument always has to be the filename(s) of the MS Annika CSM result file(s).
+You may specify any number of result files! For demonstration purposes we will
+use the files supplied in the `/data` folder:
+- `DSSO_CSMs.xlsx` contains unvalidated CSMs from a cleavable MS Annika search
+using the crosslinker DSSO.
+- `ncDSSO_CSMs.xlsx` contains unvalidated CSMs from a non-cleavable MS Annika
+search using the crosslinker DSSO.
+
+The following is a valid `msannika_merge.py` call:
+
+```bash
+python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx
+```
+
+This will merge CSMs from all given files, in this case `DSSO_CSMs.xlsx` and
+`ncDSSO_CSMs.xlsx` into a result file called `CSMs_merged.xlsx`. You can also
+set a prefix for the generated result file(s) like this:
+
+```bash
+python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx -o All_CSMs.xlsx
+```
+
+This will merge CSMs from all given files, exactly like the last command, but
+the generated result file will now be named `All_CSMs_merged.xlsx`.
+
+If you supply the optional argument `-fdr` or `--false_discovery_rate` and the
+desired FDR as a floating point number, the CSMs will be merged, then validated,
+then grouped by sequence and position to crosslinks and those crosslinks will
+again be validated for the given FDR. To group CSMs and validate CSMs and
+crosslinks the [MS Annika FDR](https://github.com/hgb-bin-proteomics/MSAnnika_FDR)
+script is downloaded and used. Validation therefore requires an active internet
+connection!
+
+```bash
+python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx -fdr 0.01
+```
+
+This will merge CSMs from all given files, then validate the merged CSMs for
+estimated 1% FDR, then group CSMs to crosslinks and finally validate the
+crosslinks for estimated 1% FDR. The following files will be generated:
+- `CSMs_merged.xlsx`: The merged CSMs from all given files.
+- `CSMs_merged_validated.xlsx`: The merged CSMs that are above the estimated 1%
+FDR threshold.
+- `Crosslinks.xlsx`: The crosslinks that result from grouping the merged CSMs.
+- `Crosslinks_validated.xlsx`: The crosslinks that are above the estimated 1%
+FDR threshold.
+
+Note that the following command will produce the same output (FDR values >= 1
+will automatically be divided by 100):
+
+```bash
+python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx -fdr 1
+```
+
+## Parameters
+
+```python
+"""
+DESCRIPTION:
+A script to combine results from several MS Annika searches.
+USAGE:
+msannika_merge.py f [f ...]
+                    [-fdr FDR][--false_discovery_rate FDR]
+                    [-o PREFIX][--output PREFIX]
+                    [-h][--help]
+                    [--version]
+positional arguments:
+  f                     MS Annika result files in Microsoft Excel format (.xlsx)
+                        to process. MS Annika result files have to be
+                        unvalidated CSMs including decoys!
+optional arguments:
+  -fdr FDR, --false_discovery_rate FDR
+                        False discovery rate to validate results for. Supports
+                        both percentage input (e.g. 1) or fraction input (e.g.
+                        0.01). By default not set and results will only be
+                        merged. Validation requires internet connection because
+                        the MS Annika FDR module will be downloaded to calculate
+                        FDR.
+                        Default: None
+  -o PREFIX, --output PREFIX
+                        Prefix of the output file(s).
+                        Default: None
+  -h, --help            show this help message and exit
+  --version             show program's version number and exit
+"""
+```
+
+## Function Documentation
+
+If you want to integrate the MS Annika Combine Results process into your own
+scripts, you can import the following function as given:
+
+```python
+import pandas as pd
+
+# Merging CSMs: merge() takes the names of the result files and reads them itself
+from msannika_merge import merge
+all_csms = merge(["DSSO_CSMs.xlsx", "ncDSSO_CSMs.xlsx"])
+
+# The function signature of merge is:
+def merge(files: List[str]) -> pd.DataFrame:
+    """code omitted"""
+    return
+```
+
+For validation please use the functions provided in [MS Annika FDR](https://github.com/hgb-bin-proteomics/MSAnnika_FDR).
+
+## Known Issues
+
+[List of known issues](https://github.com/hgb-bin-proteomics/MSAnnika_Combine_Results/issues)
+
+## Citing
+
+If you are using the MS Annika Combine Results script please cite:
+```
+MS Annika 2.0 Identifies Cross-Linked Peptides in MS2–MS3-Based Workflows at High Sensitivity and Specificity
+Micha J. Birklbauer, Manuel Matzinger, Fränze Müller, Karl Mechtler, and Viktoria Dorfer
+Journal of Proteome Research 2023 22 (9), 3009-3021
+DOI: 10.1021/acs.jproteome.3c00325
+```
+
+If you are using MS Annika please cite:
+```
+MS Annika 2.0 Identifies Cross-Linked Peptides in MS2–MS3-Based Workflows at High Sensitivity and Specificity
+Micha J. Birklbauer, Manuel Matzinger, Fränze Müller, Karl Mechtler, and Viktoria Dorfer
+Journal of Proteome Research 2023 22 (9), 3009-3021
+DOI: 10.1021/acs.jproteome.3c00325
+```
+or
+```
+MS Annika: A New Cross-Linking Search Engine
+Georg J. Pirklbauer, Christian E. Stieger, Manuel Matzinger, Stephan Winkler, Karl Mechtler, and Viktoria Dorfer
+Journal of Proteome Research 2021 20 (5), 2560-2569
+DOI: 10.1021/acs.jproteome.0c01000
+```
+
+## License
+
+- [MIT](https://github.com/hgb-bin-proteomics/MSAnnika_Combine_Results/blob/master/LICENSE)
+
+## Contact
+
+- [[email protected]](mailto:[email protected])
diff --git a/data/DSSO_CSMs.xlsx b/data/DSSO_CSMs.xlsx
diff --git a/data/ncDSSO_CSMs.xlsx b/data/ncDSSO_CSMs.xlsx
diff --git a/msannika_merge.py b/msannika_merge.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+# MS ANNIKA COMBINE RESULTS
+# 2024 (c) Micha Johannes Birklbauer
+# https://github.com/michabirklbauer/
+# [email protected]
+
+# version tracking
+__version = "1.0.0"
+__date = "2024-01-21"
+
+# REQUIREMENTS
+# pip install pandas
+# pip install openpyxl
+
+######################
+
+"""
+DESCRIPTION:
+A script to combine results from several MS Annika searches.
+USAGE:
+msannika_merge.py f [f ...]
+                    [-fdr FDR][--false_discovery_rate FDR]
+                    [-o PREFIX][--output PREFIX]
+                    [-h][--help]
+                    [--version]
+positional arguments:
+  f                     MS Annika result files in Microsoft Excel format (.xlsx)
+                        to process. MS Annika result files have to be
+                        unvalidated CSMs including decoys!
+optional arguments:
+  -fdr FDR, --false_discovery_rate FDR
+                        False discovery rate to validate results for. Supports
+                        both percentage input (e.g. 1) or fraction input (e.g.
+                        0.01). By default not set and results will only be
+                        merged. Validation requires internet connection because
+                        the MS Annika FDR module will be downloaded to calculate
+                        FDR.
+                        Default: None
+  -o PREFIX, --output PREFIX
+                        Prefix of the output file(s).
+                        Default: None
+  -h, --help            show this help message and exit
+  --version             show program's version number and exit
+"""
+
+######################
+
+import argparse
+import pandas as pd
+
+from typing import List
+from typing import Dict
+from typing import Union
+
+def merge(files: List[Union[str, pd.DataFrame]]) -> pd.DataFrame:
+    """
+    Merge the CSMs of several MS Annika result files into one table.
+
+    A CSM is identified by its "Spectrum File" and "First Scan" values. If the
+    same spectrum occurs more than once (within one input or across inputs),
+    only the CSM with the highest "Combined Score" is kept. On equal scores
+    the CSM that was seen first is kept.
+
+    Parameters:
+      files   File names/paths of MS Annika CSM results in Microsoft Excel
+              format, or already loaded pandas DataFrames. Every input needs
+              the columns "Spectrum File", "First Scan" and "Combined Score".
+
+    Returns:
+      A DataFrame with one row per (spectrum file, scan number). Rows are
+      ordered by first occurrence of the spectrum file and, within a spectrum
+      file, by first occurrence of the scan number. If the inputs contain no
+      CSMs at all, an empty DataFrame with the columns of the first input is
+      returned.
+
+    Raises:
+      KeyError if one of the required columns is missing in an input.
+    """
+
+    # best CSM seen so far: {spectrum file: {scan number: {"row": ..., "score": ...}}}
+    all_csms = dict()
+    # columns of the first input, only needed to shape an empty result
+    columns = None
+
+    for f, file in enumerate(files):
+        # already loaded tables are used as they are, everything else is read from disk
+        df = file if isinstance(file, pd.DataFrame) else pd.read_excel(file)
+        if columns is None:
+            columns = df.columns.tolist()
+        for _, row in df.iterrows():
+            spectrum_file = str(row["Spectrum File"])
+            scan_nr = int(row["First Scan"])
+            score = float(row["Combined Score"])
+
+            csms_of_file = all_csms.setdefault(spectrum_file, dict())
+            best = csms_of_file.get(scan_nr)
+            # strict comparison: an equally scored later CSM does not replace the first one
+            if best is None or best["score"] < score:
+                csms_of_file[scan_nr] = {"row": row, "score": score}
+        print(f"Processed {f + 1} CSM files...")
+
+    rows = [csm["row"] for csms_of_file in all_csms.values() for csm in csms_of_file.values()]
+
+    # pd.concat raises a ValueError if there is nothing to concatenate
+    if not rows:
+        return pd.DataFrame(columns = columns)
+
+    # every row is a Series indexed by column name: stack them side by side
+    # and transpose to get one CSM per row again
+    return pd.concat(rows, ignore_index = True, axis = 1).T
+
+
+def _output_prefix(output: str) -> str:
+    # Return the output prefix without a trailing ".xlsx" extension, if there is one.
+    extension = ".xlsx"
+    if output.endswith(extension):
+        return output[:-len(extension)]
+    return output
+
+
+def main(argv = None) -> Dict[str, pd.DataFrame]:
+    """
+    Command line entry point: merge CSM result files and optionally validate them.
+
+    The merged CSMs are always written to an Excel file. If an FDR is given,
+    the MS Annika FDR script is downloaded into the working directory and used
+    to validate the merged CSMs, group them to crosslinks and validate the
+    crosslinks; all three results are written to Excel files as well. The FDR
+    value is passed to the MS Annika FDR validators unchanged.
+
+    Parameters:
+      argv    Argument list to parse instead of the command line arguments
+              (None means the command line arguments are used).
+
+    Returns:
+      A dictionary with the keys "CSMs_merged", "CSMs_merged_validated",
+      "Crosslinks" and "Crosslinks_validated". The last three values are None
+      if no FDR was given.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(metavar = "f",
+                        dest = "files",
+                        help = "Name/Path of the MS Annika CSM result files to process.",
+                        type = str,
+                        nargs = "+")
+    parser.add_argument("-fdr", "--false_discovery_rate",
+                        dest = "fdr",
+                        default = None,
+                        help = "FDR for CSM/crosslink validation.",
+                        type = float)
+    parser.add_argument("-o", "--output",
+                        dest = "output",
+                        default = None,
+                        help = "Prefix of the output file(s).",
+                        type = str)
+    parser.add_argument("--version",
+                        action = "version",
+                        version = __version)
+    args = parser.parse_args(argv)
+
+    merged_df = merge(args.files)
+
+    result_dict = {"CSMs_merged": merged_df, "CSMs_merged_validated": None,
+                   "Crosslinks": None, "Crosslinks_validated": None}
+
+    # decide on all output file names once, so that every result uses the same prefix
+    if args.output is not None:
+        # the prefix may be given with or without the ".xlsx" extension
+        prefix = _output_prefix(args.output)
+        csms_file = prefix + "_merged.xlsx"
+        validated_csms_file = prefix + "_merged_validated.xlsx"
+        crosslinks_file = prefix + "_crosslinks.xlsx"
+        validated_crosslinks_file = prefix + "_crosslinks_validated.xlsx"
+    else:
+        csms_file = "CSMs_merged.xlsx"
+        validated_csms_file = "CSMs_merged_validated.xlsx"
+        crosslinks_file = "Crosslinks.xlsx"
+        validated_crosslinks_file = "Crosslinks_validated.xlsx"
+
+    merged_df.to_excel(csms_file, sheet_name = "CSMs", index = False)
+
+    if args.fdr is not None:
+
+        print("Validating using MS Annika FDR...")
+
+        # NOTE(review): this downloads a script from the master branch of another
+        # repository, overwrites any local "msannika_fdr.py" and imports (executes)
+        # it - consider pinning a release/commit or shipping the module instead.
+        import urllib.request as ur
+        msannika_fdr_url = "https://raw.githubusercontent.com/hgb-bin-proteomics/MSAnnika_FDR/master/msannika_fdr.py"
+        ur.urlretrieve(msannika_fdr_url, "msannika_fdr.py")
+
+        from msannika_fdr import MSAnnika_CSM_Grouper as grouper
+        from msannika_fdr import MSAnnika_CSM_Validator as csm_val
+        from msannika_fdr import MSAnnika_Crosslink_Validator as xl_val
+
+        validated_csms = csm_val.validate(merged_df, args.fdr)
+        result_dict["CSMs_merged_validated"] = validated_csms
+        # crosslinks are grouped from all merged CSMs, not only from the validated ones
+        crosslinks = grouper.group(merged_df)
+        result_dict["Crosslinks"] = crosslinks
+        validated_crosslinks = xl_val.validate(crosslinks, args.fdr)
+        result_dict["Crosslinks_validated"] = validated_crosslinks
+
+        validated_csms.to_excel(validated_csms_file, sheet_name = "CSMs", index = False)
+        crosslinks.to_excel(crosslinks_file, sheet_name = "Crosslinks", index = False)
+        validated_crosslinks.to_excel(validated_crosslinks_file, sheet_name = "Crosslinks", index = False)
+
+    print("Done!")
+    return result_dict
+
+if __name__ == "__main__":
+    r = main()