v1.0.0 release #1

Merged (10 commits, Jan 21, 2024)
47 changes: 47 additions & 0 deletions .github/workflows/python-app.yml
@@ -0,0 +1,47 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# Reference workflow provided by (c) GitHub
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: msannika_merge

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Copy scripts and data to "/tests"
      run: |
        cp msannika_merge.py tests
        cp data/DSSO_CSMs.xlsx .
        cp data/ncDSSO_CSMs.xlsx .
        wget https://raw.githubusercontent.com/hgb-bin-proteomics/MSAnnika_FDR/master/msannika_fdr.py
        cp msannika_fdr.py tests
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest tests/tests.py
28 changes: 28 additions & 0 deletions CITATION.cff
@@ -0,0 +1,28 @@
cff-version: 1.2.0
preferred-citation:
  type: article
  authors:
    - family-names: "Birklbauer"
      given-names: "Micha J."
      orcid: "https://orcid.org/0009-0005-1051-179X"
    - family-names: "Matzinger"
      given-names: "Manuel"
      orcid: "https://orcid.org/0000-0002-9765-7951"
    - family-names: "Müller"
      given-names: "Fränze"
      orcid: "https://orcid.org/0000-0003-3764-3547"
    - family-names: "Mechtler"
      given-names: "Karl"
      orcid: "https://orcid.org/0000-0002-3392-9946"
    - family-names: "Dorfer"
      given-names: "Viktoria"
      orcid: "https://orcid.org/0000-0002-5332-5701"
  doi: "10.1021/acs.jproteome.3c00325"
  journal: "Journal of Proteome Research"
  month: 9
  start: 3009
  end: 3021
  title: "MS Annika 2.0 Identifies Cross-Linked Peptides in MS2–MS3-Based Workflows at High Sensitivity and Specificity"
  issue: 9
  volume: 22
  year: 2023
168 changes: 167 additions & 1 deletion README.md
@@ -1 +1,167 @@
# MSAnnika_Combine_Results
![workflow_state](https://github.com/hgb-bin-proteomics/MSAnnika_Combine_Results/workflows/msannika_merge/badge.svg)

# MS Annika Combine Results

A script to merge and optionally validate several [MS Annika](https://github.com/hgb-bin-proteomics/MSAnnika)
search results. The main use case is merging results from different MS Annika
runs, e.g. combining results from a cleavable and a non-cleavable MS Annika
search, or combining results from different doublet distances.

## Usage

- Install Python 3.7+: [https://www.python.org/downloads/](https://www.python.org/downloads/)
- Install requirements: `pip install -r requirements.txt`
- Export MS Annika CSM results from Proteome Discoverer to Microsoft Excel format.
- **Important:** CSMs should not be filtered! Export all (unvalidated) CSMs including decoy hits!
- Run `python msannika_merge.py filename1.xlsx filename2.xlsx -fdr 0.01` (see below for more examples).
- The script may take a few minutes, depending on the number of CSMs to process.
- Done!

## Examples

`msannika_merge.py` takes one positional and two optional arguments. The first
argument always has to be the filename(s) of the MS Annika CSM result file(s).
You may specify any number of result files! For demonstration purposes we will
use the files supplied in the `/data` folder:
- `DSSO_CSMs.xlsx` contains unvalidated CSMs from a cleavable MS Annika search
using the crosslinker DSSO.
- `ncDSSO_CSMs.xlsx` contains unvalidated CSMs from a non-cleavable MS Annika
search using the crosslinker DSSO.

The following is a valid `msannika_merge.py` call:

```bash
python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx
```

This will merge CSMs from all given files, in this case `DSSO_CSMs.xlsx` and
`ncDSSO_CSMs.xlsx`, into a result file called `CSMs_merged.xlsx`. You can also
set a prefix for the generated result file(s) like this:

```bash
python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx -o All_CSMs.xlsx
```

This will merge CSMs from all given files, exactly like the last command, but
the generated result file will now be named `All_CSMs_merged.xlsx`.
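Merging is not a plain concatenation: if several input files contain a CSM for the same spectrum (same spectrum file and scan number), only the highest-scoring one is kept. The idea can be sketched with a hypothetical `keep_best_csms` helper (not part of the script, shown only to illustrate the deduplication rule):

```python
import pandas as pd

def keep_best_csms(csms: pd.DataFrame) -> pd.DataFrame:
    # for CSMs sharing "Spectrum File" and "First Scan", keep only the
    # row with the highest "Combined Score"
    best = csms.groupby(["Spectrum File", "First Scan"])["Combined Score"].idxmax()
    return csms.loc[best].reset_index(drop = True)
```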

If you supply the optional argument `-fdr` or `--false_discovery_rate` and the
desired FDR as a floating point number, the CSMs will be merged, validated, and
grouped by sequence and position into crosslinks, and those crosslinks will
again be validated for the given FDR. To group CSMs and validate CSMs and
crosslinks, the [MS Annika FDR](https://github.com/hgb-bin-proteomics/MSAnnika_FDR)
script is downloaded and used. Validation therefore requires an active internet
connection!

```bash
python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx -fdr 0.01
```

This will merge CSMs from all given files, then validate the merged CSMs for
estimated 1% FDR, then group CSMs to crosslinks and finally validate the
crosslinks for estimated 1% FDR. The following files will be generated:
- `CSMs_merged.xlsx`: The merged CSMs from all given files.
- `CSMs_merged_validated.xlsx`: The merged CSMs that are above the estimated 1%
FDR threshold.
- `Crosslinks.xlsx`: The crosslinks that result from grouping the merged CSMs.
- `Crosslinks_validated.xlsx`: The crosslinks that are above the estimated 1%
FDR threshold.

Note that the following command will produce the same output (FDR values >= 1
will automatically be divided by 100):

```bash
python msannika_merge.py DSSO_CSMs.xlsx ncDSSO_CSMs.xlsx -fdr 1
```
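The interpretation rule can be sketched as follows (a hypothetical `normalize_fdr` helper, shown only to illustrate how the `-fdr` value is read; the actual conversion presumably happens inside the MS Annika FDR module):

```python
def normalize_fdr(fdr: float) -> float:
    # values >= 1 are interpreted as percentages, values < 1 as fractions
    return fdr / 100.0 if fdr >= 1.0 else fdr
```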

## Parameters

```python
"""
DESCRIPTION:
A script to combine results from several MS Annika searches.
USAGE:
msannika_merge.py f [f ...]
[-fdr FDR][--false_discovery_rate FDR]
[-h][--help]
[--version]
positional arguments:
f MS Annika result files in Microsoft Excel format (.xlsx)
to process. MS Annika result files have to be
unvalidated CSMs including decoys!
optional arguments:
-fdr FDR, --false_discovery_rate FDR
False discovery rate to validate results for. Supports
both percentage input (e.g. 1) or fraction input (e.g.
0.01). By default not set and results will only be
merged. Validation requires internet connection because
the MS Annika FDR module will be downloaded to calculate
FDR.
Default: None
-o PREFIX, --output PREFIX
Prefix of the output file(s).
Default: None
-h, --help show this help message and exit
--version show program's version number and exit
"""
```

## Function Documentation

If you want to integrate the MS Annika Combine Results process into your own
scripts, you can import the following function as given:

```python
import pandas as pd

# Merging CSMs: merge() takes a list of file names and returns the
# merged CSMs as a pandas DataFrame
from msannika_merge import merge
all_csms = merge(["DSSO_CSMs.xlsx", "ncDSSO_CSMs.xlsx"])

# The function signature of merge is:
def merge(files: List[str]) -> pd.DataFrame:
    """code omitted"""
    return
```

For validation please use the functions provided in [MS Annika FDR](https://github.com/hgb-bin-proteomics/MSAnnika_FDR).

## Known Issues

[List of known issues](https://github.com/hgb-bin-proteomics/MSAnnika_Combine_Results/issues)

## Citing

If you are using the MS Annika Combine Results script please cite:
```
MS Annika 2.0 Identifies Cross-Linked Peptides in MS2–MS3-Based Workflows at High Sensitivity and Specificity
Micha J. Birklbauer, Manuel Matzinger, Fränze Müller, Karl Mechtler, and Viktoria Dorfer
Journal of Proteome Research 2023 22 (9), 3009-3021
DOI: 10.1021/acs.jproteome.3c00325
```

If you are using MS Annika please cite:
```
MS Annika 2.0 Identifies Cross-Linked Peptides in MS2–MS3-Based Workflows at High Sensitivity and Specificity
Micha J. Birklbauer, Manuel Matzinger, Fränze Müller, Karl Mechtler, and Viktoria Dorfer
Journal of Proteome Research 2023 22 (9), 3009-3021
DOI: 10.1021/acs.jproteome.3c00325
```
or
```
MS Annika: A New Cross-Linking Search Engine
Georg J. Pirklbauer, Christian E. Stieger, Manuel Matzinger, Stephan Winkler, Karl Mechtler, and Viktoria Dorfer
Journal of Proteome Research 2021 20 (5), 2560-2569
DOI: 10.1021/acs.jproteome.0c01000
```

## License

- [MIT](https://github.com/hgb-bin-proteomics/MSAnnika_Combine_Results/blob/master/LICENSE)

## Contact

- [[email protected]](mailto:[email protected])
Binary file added data/DSSO_CSMs.xlsx
Binary file added data/ncDSSO_CSMs.xlsx
151 changes: 151 additions & 0 deletions msannika_merge.py
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

# MS ANNIKA COMBINE RESULTS
# 2024 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# [email protected]

# version tracking
__version = "1.0.0"
__date = "2024-01-21"

# REQUIREMENTS
# pip install pandas
# pip install openpyxl

######################

"""
DESCRIPTION:
A script to combine results from several MS Annika searches.
USAGE:
msannika_merge.py f [f ...]
[-fdr FDR][--false_discovery_rate FDR]
[-h][--help]
[--version]
positional arguments:
f MS Annika result files in Microsoft Excel format (.xlsx)
to process. MS Annika result files have to be
unvalidated CSMs including decoys!
optional arguments:
-fdr FDR, --false_discovery_rate FDR
False discovery rate to validate results for. Supports
both percentage input (e.g. 1) or fraction input (e.g.
0.01). By default not set and results will only be
merged. Validation requires internet connection because
the MS Annika FDR module will be downloaded to calculate
FDR.
Default: None
-o PREFIX, --output PREFIX
Prefix of the output file(s).
Default: None
-h, --help show this help message and exit
--version show program's version number and exit
"""

######################

import argparse
import pandas as pd

from typing import List
from typing import Dict

def merge(files: List[str]) -> pd.DataFrame:

    all_csms = dict()
    columns = None

    for f, file in enumerate(files):
        df = pd.read_excel(file)
        if columns is None:
            columns = df.columns.tolist()
        for i, row in df.iterrows():
            spectrum_file = str(row["Spectrum File"])
            scan_nr = int(row["First Scan"])
            score = float(row["Combined Score"])

            # keep only the highest-scoring CSM per spectrum file and scan number
            if spectrum_file in all_csms:
                if scan_nr in all_csms[spectrum_file]:
                    if all_csms[spectrum_file][scan_nr]["score"] < score:
                        all_csms[spectrum_file][scan_nr] = {"row": row, "score": score}
                else:
                    all_csms[spectrum_file][scan_nr] = {"row": row, "score": score}
            else:
                all_csms[spectrum_file] = {scan_nr: {"row": row, "score": score}}
        print(f"Processed {f + 1} CSM files...")

    rows = list()

    for spectrum_file in all_csms:
        for scan_nr in all_csms[spectrum_file]:
            rows.append(all_csms[spectrum_file][scan_nr]["row"])

    return pd.concat(rows, ignore_index = True, axis = 1, names = columns).T


def main(argv = None) -> Dict[str, pd.DataFrame]:
    parser = argparse.ArgumentParser()
    parser.add_argument(metavar = "f",
                        dest = "files",
                        help = "Name/Path of the MS Annika CSM result files to process.",
                        type = str,
                        nargs = "+")
    parser.add_argument("-fdr", "--false_discovery_rate",
                        dest = "fdr",
                        default = None,
                        help = "FDR for CSM/crosslink validation.",
                        type = float)
    parser.add_argument("-o", "--output",
                        dest = "output",
                        default = None,
                        help = "Prefix of the output file(s).",
                        type = str)
    parser.add_argument("--version",
                        action = "version",
                        version = __version)
    args = parser.parse_args(argv)

    merged_df = merge(args.files)

    result_dict = {"CSMs_merged": merged_df, "CSMs_merged_validated": None,
                   "Crosslinks": None, "Crosslinks_validated": None}

    # strip a trailing ".xlsx" from the output prefix before appending the suffix
    if args.output is not None:
        merged_df.to_excel(".xlsx".join(args.output.split(".xlsx")[:-1]) + "_merged.xlsx", sheet_name = "CSMs", index = False)
    else:
        merged_df.to_excel("CSMs_merged.xlsx", sheet_name = "CSMs", index = False)

    if args.fdr is not None:

        print("Validating using MS Annika FDR...")

        # download the MS Annika FDR module (requires an internet connection)
        import urllib.request as ur
        msannika_fdr_url = "https://raw.githubusercontent.com/hgb-bin-proteomics/MSAnnika_FDR/master/msannika_fdr.py"
        ur.urlretrieve(msannika_fdr_url, "msannika_fdr.py")

        from msannika_fdr import MSAnnika_CSM_Grouper as grouper
        from msannika_fdr import MSAnnika_CSM_Validator as csm_val
        from msannika_fdr import MSAnnika_Crosslink_Validator as xl_val

        validated_csms = csm_val.validate(merged_df, args.fdr)
        result_dict["CSMs_merged_validated"] = validated_csms
        crosslinks = grouper.group(merged_df)
        result_dict["Crosslinks"] = crosslinks
        validated_crosslinks = xl_val.validate(crosslinks, args.fdr)
        result_dict["Crosslinks_validated"] = validated_crosslinks

        if args.output is not None:
            validated_csms.to_excel(".xlsx".join(args.output.split(".xlsx")[:-1]) + "_merged_validated.xlsx", sheet_name = "CSMs", index = False)
            crosslinks.to_excel(".xlsx".join(args.output.split(".xlsx")[:-1]) + "_crosslinks.xlsx", sheet_name = "Crosslinks", index = False)
            validated_crosslinks.to_excel(".xlsx".join(args.output.split(".xlsx")[:-1]) + "_crosslinks_validated.xlsx", sheet_name = "Crosslinks", index = False)
        else:
            validated_csms.to_excel("CSMs_merged_validated.xlsx", sheet_name = "CSMs", index = False)
            crosslinks.to_excel("Crosslinks.xlsx", sheet_name = "Crosslinks", index = False)
            validated_crosslinks.to_excel("Crosslinks_validated.xlsx", sheet_name = "Crosslinks", index = False)

    print("Done!")
    return result_dict

if __name__ == "__main__":
    r = main()