Skip to content

Commit

Permalink
Modularize io (#47)
Browse files Browse the repository at this point in the history
* if the run step is manually aborted, it can be restarted without recomputing what has already been computed. Needs more testing and works only if post_process has not started yet, but seems to do the trick

* added somewhat dangerous cleanup, does the job but make it safer

* finished modularization in theory: only ppkts that have not been run yet are run, ontogpt output is appended, the full dataframe save location has been moved, plots and data dir separated. BUT all of this needs to be tested

* lifted buggy is_file(), tested adding a model, then adding a phenopacket, in both cases successful run

* added possibility of ppkts not being correctly run. If output file is present but empty, rerun it. Tested.

* significant cleanup of run with Daniel and Peter, input is now manually given thru csv file, extended analysis significantly

* added safe saving of dfs and some analysis scripts
  • Loading branch information
leokim-l committed Sep 3, 2024
1 parent 5c3ca68 commit 070f74c
Show file tree
Hide file tree
Showing 12 changed files with 373 additions and 144 deletions.
67 changes: 67 additions & 0 deletions src/malco/analysis/check_lens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import pandas as pd
from typing import List

import pandas as pd
import yaml
#from malco.post_process.post_process_results_format import read_raw_result_yaml
from pathlib import Path
import sys

def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
    """
    Read every YAML document contained in a raw result file.

    Args:
        raw_result_path(Path): Path to the raw result file.
    Returns:
        List[dict]: One entry per YAML document in the file, in file order.
    """
    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # platform's default locale encoding.
    with open(raw_result_path, 'r', encoding='utf-8') as raw_result:
        # Drop U+0004 characters before parsing; presumably these are stray
        # control characters in the raw output that would break YAML loading.
        contents = raw_result.read().replace(u'\x04', '')
    return list(yaml.safe_load_all(contents))  # Load and convert to list

# For every model, compare the number of prompts found in the raw results.yaml
# with the number of phenopackets that ended up in results.tsv, then list the
# phenopackets present for the reference model but missing for other models.
unique_ppkts = {}
models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
for model in models:
    print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)

    yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
    all_results = read_raw_result_yaml(yamlfile)

    counter = 0
    labelvec = []

    for this_result in all_results:
        # An empty YAML document is loaded as None; skip it instead of
        # failing on `.get`.
        if not this_result:
            continue
        extracted_object = this_result.get("extracted_object")
        if extracted_object:
            labelvec.append(extracted_object.get('label'))
            # Only results with a non-empty differential are counted.
            if extracted_object.get('terms'):
                counter += 1

    full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
    df = pd.read_csv(full_df_file, sep='\t')
    num_ppkts = df['label'].nunique()
    unique_ppkts[model] = df['label'].unique()
    # The first should be equivalent to grepping "raw_" in some results.yaml
    print("The number of prompts that have something in results.yaml are: ", len(labelvec))
    print("The number of prompts that have a non-empty differential (i.e. term is not None) is:", counter)
    print("The number of unique prompts/ppkts with a non-empty differential in results.tsv are:", num_ppkts, "\n")

# This we know a posteriori, gpt-4o and gpt-4-turbo both have 5213 phenopackets
# Thus, let's print out what is missing in the others
reference_model = "gpt-4-turbo"
incomplete_models = ["gpt-4", "gpt-3.5-turbo"]
for position, incomplete_model in enumerate(incomplete_models):
    if position:
        # Blank separator between the reports of consecutive models.
        print("\n")
    # Build the lookup set once so each membership test is O(1) rather than
    # a linear scan of the label array.
    present = set(unique_ppkts[incomplete_model])
    for ppkt in unique_ppkts[reference_model]:
        if ppkt not in present:
            print(f"Missing ppkt in {incomplete_model} is:\t", ppkt)
20 changes: 20 additions & 0 deletions src/malco/analysis/check_shelved_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# check shelved cache. Can maxsize be changed at a later point in time?

from cachetools import LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

# Scratch experiment: create a persistent cache with one maxsize, close it,
# then reopen the same file name with a larger maxsize. The breakpoints are
# there so the state can be inspected by hand in the debugger.
file_name = "test_increasing_cache"

# First cache: LRUCache-based, created under `file_name` with maxsize=4096.
pc = PersistentCache(LRUCache, file_name, maxsize=4096)

# Store a single entry so there is something to look for after reopening.
pc["a"] = 42

# Close the first cache before opening a second one on the same file name.
pc.close()
breakpoint()

# Second cache: same file name, larger maxsize (16384).
# NOTE(review): whether the entry stored above is visible here, and whether
# the new maxsize takes effect, is what this script is meant to check.
pc2 = PersistentCache(LRUCache, file_name, maxsize=16384)

# Inspect `pc2` by hand before it is closed.
breakpoint()
pc2.close()

55 changes: 40 additions & 15 deletions src/malco/analysis/eval_diagnose_category.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import pandas as pd
import numpy as np
import sys

from oaklib.datamodels.vocabulary import IS_A, PART_OF
from oaklib.interfaces import MappingProviderInterface
from oaklib.interfaces import OboGraphInterface
from oaklib.interfaces.obograph_interface import GraphTraversalMethod

from oaklib import get_adapter

from cachetools import cached, LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

pc_cache_file = "trial_diagnose_cache"
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)


def mondo_adapter() -> OboGraphInterface:
"""
Expand All @@ -19,55 +26,67 @@ def mondo_adapter() -> OboGraphInterface:
return get_adapter("sqlite:obo:mondo")

def mondo_mapping(term, adapter):
print(term)
mondos = []
for m in adapter.sssom_mappings([term], source="OMIM"):
if m.predicate_id == "skos:exactMatch":
mondos.append(m.subject_id)
return mondos

@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
def find_category(omim_term, disease_categories, mondo):
if not isinstance(mondo, MappingProviderInterface):
raise ValueError("Adapter is not an MappingProviderInterface")
# What is best algorithm to avoid traversing the mondo graph a billion times?
raise ValueError("Adapter is not a MappingProviderInterface")
# Find ancestors
mondo_term = mondo_mapping(omim_term, mondo)
if not mondo_term:
print(omim_term)
return None

ancestor_list = mondo.ancestors(mondo_term, predicates=[IS_A, PART_OF]) #, reflexive=True) # method=GraphTraversalMethod.ENTAILMENT

for mondo_ancestor in ancestor_list:
if mondo_ancestor in disease_categories:
return mondo_ancestor # This should be smt like MONDO:0045024 (cancer or benign tumor)

print("Special issue following: ")
print(omim_term)


#=====================================================
# script starts here
# Find 42 diseases categories
#=====================================================

mondo = mondo_adapter()
disease_categories = mondo.relationships(objects = ["MONDO:0700096"], predicates=[IS_A])

# make df contingency table with header=diseases_category, correct, incorrect and initialize all to 0.
header = ["label","correct", "incorrect"]
#header = ["diseases_category", "correct", "incorrect"]
dc_list = [i[0] for i in list(disease_categories)]
#contingency_table = pd.DataFrame(0, index=np.arange(len(dc_list)), columns=header)
contingency_table = pd.DataFrame(0, index=dc_list, columns=header)
#dc_labels = []
for j in dc_list:
contingency_table.loc[j,"label"] = mondo.label(j)


# example path of full results
filename = "testout_multmodel_b4run/raw_results/multimodel/gpt-4/full_df_results.tsv"

model=str(sys.argv[1])
filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
# label term score rank correct_term is_correct reciprocal_rank
# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0

df = pd.read_csv(
filename, sep="\t" #, header=None, names=["description", "term", "label"]
filename, sep="\t"
)

ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
count_fails=0

omim_wo_match = {}
for ppkt in ppkts:
# find this phenopackets category <cat> from OMIM
category_index = find_category(ppkt[1].iloc[0]["correct_term"], dc_list, mondo)
if not category_index:
count_fails += 1
#print(f"Category index for {ppkt[1].iloc[0]["correct_term"]} ")
omim_wo_match[ppkt[0]] = ppkt[1].iloc[0]["correct_term"]
continue
#cat_ind = find_cat_index(category)
# is there a true? ppkt is tuple ("filename", dataframe) --> ppkt[1] is a dataframe
if not any(ppkt[1]["is_correct"]):
Expand All @@ -77,5 +96,11 @@ def find_category(omim_term, disease_categories, mondo):
# yes --> increase <cat> correct
contingency_table.loc[category_index, "correct"] += 1

print(contingency_table)

print("\n\n", "==="*15,"\n")
print(f"For whatever reason find_category() returned None in {count_fails} cases, wich follow:\n") # print to file!
#print(contingency_table)
print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))

cont_table_file = f"disease_groups/{model}.tsv"
# Will overwrite
#contingency_table.to_csv(cont_table_file, sep='\t')
15 changes: 15 additions & 0 deletions src/malco/post_process/df_save_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import shutil
import os
import pandas as pd

def safe_save_tsv(path, filename, df):
    """
    Write `df` as a TSV file, keeping one backup of any previous version.

    If `path/filename` already exists it is renamed to `path/old_<filename>`
    (replacing an earlier backup, if any) before the new file is written.
    It is the user's responsibility to know that only up to 2 versions can
    exist; anything older is lost.

    Args:
        path: Directory to write into (a `pathlib.Path` or a plain string).
        filename (str): Name of the TSV file inside `path`.
        df (pd.DataFrame): Data to save; written tab-separated, without index.
    """
    # os.path.join accepts both str and Path-like directories.
    full_path = os.path.join(path, filename)
    if os.path.isfile(full_path):
        old_full_path = os.path.join(path, "old_" + filename)
        # A single rename replaces the previous remove + copy + remove
        # sequence and overwrites an existing backup in one step.
        os.replace(full_path, old_full_path)
    df.to_csv(full_path, sep='\t', index=False)
18 changes: 5 additions & 13 deletions src/malco/post_process/generate_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@

# Make a nice plot, use it as function or as script

def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, comparing):
def make_plots(mrr_file, data_dir, languages, num_ppkt, models, topn_aggr_file, comparing):
plot_dir = data_dir.parents[0] / "plots"
plot_dir.mkdir(exist_ok=True)

if comparing=="model":
name_string = str(len(models))
else:
Expand All @@ -30,19 +33,8 @@ def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, compa
plt.close()

# Plotting bar-plots with top<n> ranks
df = pd.read_csv(topn_file, delimiter='\t')
df["top1"] = df['n1']
df["top3"] = df["n1"] + df["n2"] + df["n3"]
df["top5"] = df["top3"] + df["n4"] + df["n5"]
df["top10"] = df["top5"] + df["n6"] + df["n7"] + df["n8"] + df["n9"] + df["n10"]
df["not_found"] = df["nf"]
df_aggr = pd.read_csv(topn_aggr_file, delimiter='\t')

df_aggr = pd.DataFrame()
df_aggr = pd.melt(df, id_vars=comparing, value_vars=["top1", "top3", "top5", "top10", "not_found"], var_name="Rank_in", value_name="counts")
df_aggr["percentage"] = df_aggr["counts"]/num_ppkt
bar_data_file = plot_dir / "topn_aggr.tsv"
df_aggr.to_csv(bar_data_file, sep='\t', index=False)

sns.barplot(x="Rank_in", y="percentage", data = df_aggr, hue = comparing)

plt.xlabel("Number of Ranks in")
Expand Down
2 changes: 1 addition & 1 deletion src/malco/post_process/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
output_dir=output_lang, output_file_name="results.tsv")
'''

#TODO should this duplicated code a single code with a parameter?
for model in models:
raw_results_model = raw_results_dir / "multimodel" / model
output_model = output_dir / "multimodel" / model
Expand Down
13 changes: 9 additions & 4 deletions src/malco/post_process/post_process_results_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
import os
from pathlib import Path
from typing import List

import shutil
import pandas as pd
import yaml
from pheval.post_processing.post_processing import PhEvalGeneResult, generate_pheval_result
from pheval.utils.file_utils import all_files
from pheval.utils.phenopacket_utils import GeneIdentifierUpdater, create_hgnc_dict
from malco.post_process.df_save_util import safe_save_tsv



def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
Expand All @@ -21,14 +23,15 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
dict: Contents of the raw result file.
"""
with open(raw_result_path, 'r') as raw_result:
return list(yaml.safe_load_all(raw_result)) # Load and convert to list
return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list


def create_standardised_results(raw_results_dir: Path, output_dir: Path,
output_file_name: str) -> pd.DataFrame:
data = []
for raw_result_path in raw_results_dir.iterdir():
if raw_result_path.is_file():
# Cannot have further files in raw_result_path!
all_results = read_raw_result_yaml(raw_result_path)

for this_result in all_results:
Expand All @@ -37,6 +40,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
label = extracted_object.get('label')
terms = extracted_object.get('terms')
if terms:
# Note, the if allows for rerunning ppkts that failed due to connection issues
# We can have multiple identical ppkts/prompts in results.yaml as long as only one has a terms field
num_terms = len(terms)
score = [1 / (i + 1) for i in range(num_terms)] # score is reciprocal rank
rank_list = [ i+1 for i in range(num_terms)]
Expand All @@ -47,8 +52,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
df = pd.DataFrame(data)

# Save DataFrame to TSV
output_path = output_dir / output_file_name
df.to_csv(output_path, sep='\t', index=False)
# output_path = output_dir / output_file_name
safe_save_tsv(output_dir, output_file_name, df)

return df

Expand Down
Loading

0 comments on commit 070f74c

Please sign in to comment.