Skip to content

Commit

Permalink
protinter can now be installed as a CLI app via a new setup.py installer
Browse files Browse the repository at this point in the history
  • Loading branch information
yayekit committed Sep 7, 2024
1 parent c19d645 commit c3c4589
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 24 deletions.
66 changes: 42 additions & 24 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,49 @@
from ensemble import create_ensemble
from visualization import plot_feature_importance, plot_confusion_matrix, plot_correlation_matrix

#!/usr/bin/env python3
import argparse
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from features import extract_features
from model import load_model, predict_new_data
import numpy as np

def main(argv=None):
    """Predict whether the protein in a FASTA file is likely to interact.

    Reads the first record of the FASTA file named on the command line,
    converts it to a feature vector with ``extract_features``, loads the
    pre-trained model and scaler from ``protein_interaction_model.joblib``
    (resolved against the current working directory), and prints the
    verdict to stdout. Diagnostics go to stderr so that stdout carries
    only the result.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default, which is what the ``console_scripts`` entry
            point passes) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when the sequence file is missing,
            unreadable or contains no records, or when the model file is
            missing; with status 2 when argparse rejects the arguments.
    """
    parser = argparse.ArgumentParser(description="Predict protein interactions.")
    parser.add_argument(
        "sequence_file",
        help="Path to the protein sequence file (FASTA format)",
    )
    args = parser.parse_args(argv)

    # Only the first record is used; any further records are ignored.
    try:
        with open(args.sequence_file, "r") as handle:
            record = next(SeqIO.parse(handle, "fasta"))
    except FileNotFoundError:
        print(f"Error: File '{args.sequence_file}' not found.", file=sys.stderr)
        sys.exit(1)
    except StopIteration:
        print(f"Error: No sequences found in '{args.sequence_file}'.", file=sys.stderr)
        sys.exit(1)
    except OSError as exc:
        # Permission problems, a directory passed as the path, etc. Must come
        # after FileNotFoundError, which is itself an OSError subclass.
        print(f"Error: Could not read '{args.sequence_file}': {exc}", file=sys.stderr)
        sys.exit(1)
    sequence = record.seq

    # Extract features and shape them as a single-row matrix.
    features = extract_features(sequence)
    X_new = np.array([list(features.values())])

    # Load the pre-trained model.
    try:
        model, scaler = load_model("protein_interaction_model.joblib")
    except FileNotFoundError:
        print(
            "Error: Pre-trained model not found. Please ensure "
            "'protein_interaction_model.joblib' is in the current directory.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Make the prediction; label 1 is reported as an interaction.
    prediction = predict_new_data(model, scaler, X_new)

    result = "likely to interact" if prediction[0] == 1 else "unlikely to interact"
    print(f"The protein sequence in '{args.sequence_file}' is {result}.")

# Run the command-line interface only when this file is executed as a script;
# importing the module (e.g. through the console-script entry point) must not
# trigger a prediction.
if __name__ == "__main__":
    main()
23 changes: 23 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Packaging script that installs the predictor as the ``prot`` command."""
from pathlib import Path

from setuptools import find_packages, setup

# The entry point below targets ``main:main`` and main.py imports its siblings
# as top-level modules, so the project uses a flat layout. find_packages()
# only collects directories that contain an __init__.py; without py_modules
# none of these modules would be installed and the ``prot`` command would
# fail with ModuleNotFoundError.
# NOTE(review): this picks up every .py file next to setup.py (except setup.py
# itself) — confirm no stray scripts live there, or switch to an explicit list.
_HERE = Path(__file__).resolve().parent
_TOP_LEVEL_MODULES = sorted(
    path.stem for path in _HERE.glob("*.py") if path.stem != "setup"
)

setup(
    name="prot",
    version="0.1",
    packages=find_packages(),
    py_modules=_TOP_LEVEL_MODULES,
    install_requires=[
        "biopython",
        "numpy",
        "scikit-learn",
        "xgboost",
        "joblib",
    ],
    entry_points={
        "console_scripts": [
            "prot=main:main",
        ],
    },
    include_package_data=True,
    # NOTE(review): package_data only applies to files located inside the
    # packages found above, and main() loads the model from the current
    # working directory — confirm the model file actually ships and is found
    # after installation.
    package_data={
        "": ["protein_interaction_model.joblib"],
    },
)

0 comments on commit c3c4589

Please sign in to comment.