Modularize io (#47)

* If the run step is manually aborted, it can be restarted without recomputing what has already been computed. Needs more testing and works only if post_process has not started yet, but seems to do the trick.
* Added somewhat dangerous cleanup; it does the job but should be made safer.
* Finished modularization in theory: only ppkts that have not yet been run are run, ontogpt output is appended, the full dataframe save location has been moved, and the plots and data dirs are separated. BUT all of this needs to be tested.
* Lifted buggy is_file(); tested adding a model, then adding a phenopacket, in both cases successful run.
* Added the possibility of ppkts not being correctly run: if the output file is present but empty, rerun it. Tested.
* Significant cleanup of run with Daniel and Peter; input is now given manually through a csv file; extended analysis significantly.
* Added safe saving of dfs and some analysis scripts.
monarch-initiative · Sep 3, 2024 · 070f74c · 070f74c
1 parent 5c3ca68
commit 070f74c
Show file tree

Hide file tree

Showing 12 changed files with 373 additions and 144 deletions.
diff --git a/src/malco/analysis/check_lens.py b/src/malco/analysis/check_lens.py
@@ -0,0 +1,67 @@
+import pandas as pd 
+from typing import List
+
+import pandas as pd
+import yaml
+#from malco.post_process.post_process_results_format import read_raw_result_yaml
+from pathlib import Path
+import sys
+
+def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
+ """
+ Read the raw result file.
+
+ Args:
+ raw_result_path(Path): Path to the raw result file.
+
+ Returns:
+ dict: Contents of the raw result file.
+ """
+ with open(raw_result_path, 'r') as raw_result:
+ return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
+
+unique_ppkts = {}
+#model=str(sys.argv[1])
+models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
+for model in models:
+ print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)
+
+ yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
+ all_results=read_raw_result_yaml(yamlfile)
+
+ counter = 0
+ labelvec = []
+
+ # Cannot have further files in raw_result_path!
+ for this_result in all_results:
+ extracted_object = this_result.get("extracted_object")
+ if extracted_object:
+ label = extracted_object.get('label')
+ labelvec.append(label)
+ terms = extracted_object.get('terms')
+ if terms:
+ counter += 1
+
+ full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
+ df = pd.read_csv(full_df_file, sep='\t')
+ num_ppkts = df['label'].nunique()
+ unique_ppkts[model] = df['label'].unique()
+ # The first should be equivalent to grepping "raw_" in some results.yaml
+ print("The number of prompts that have something in results.yaml are: ", len(labelvec))
+ print("The number of prompts that have a non-empty differential (i.e. term is not None) is:", counter)
+ print("The number of unique prompts/ppkts with a non-empty differential in results.tsv are:", num_ppkts, "\n")
+
+# This we know a posteriori, gpt-4o and gpt-4-turbo both have 5213 phenopackets
+# Thus, let's print out what is missing in the others
+for i in unique_ppkts["gpt-4-turbo"]:
+ if i in unique_ppkts["gpt-4"]:
+ continue
+ else:
+ print(f"Missing ppkt in gpt-4 is:\t", i)
+print("\n")
+
+for i in unique_ppkts["gpt-4-turbo"]:
+ if i in unique_ppkts["gpt-3.5-turbo"]:
+ continue
+ else:
+ print(f"Missing ppkt in gpt-3.5-turbo is:\t", i)
diff --git a/src/malco/analysis/check_shelved_cache.py b/src/malco/analysis/check_shelved_cache.py
@@ -0,0 +1,20 @@
+# check shelved cache. Can maxsize be changed at a later point in time?
+
+from cachetools import LRUCache
+from cachetools.keys import hashkey
+from shelved_cache import PersistentCache
+
+file_name = "test_increasing_cache"
+
+pc = PersistentCache(LRUCache, file_name, maxsize=4096) 
+
+pc["a"] = 42
+
+pc.close()
+breakpoint()
+
+pc2 = PersistentCache(LRUCache, file_name, maxsize=16384) 
+
+breakpoint()
+pc2.close()
+
diff --git a/src/malco/analysis/eval_diagnose_category.py b/src/malco/analysis/eval_diagnose_category.py
@@ -1,13 +1,20 @@
 import pandas as pd
 import numpy as np
+import sys
 
 from oaklib.datamodels.vocabulary import IS_A, PART_OF
 from oaklib.interfaces import MappingProviderInterface
 from oaklib.interfaces import OboGraphInterface
 from oaklib.interfaces.obograph_interface import GraphTraversalMethod
-
 from oaklib import get_adapter
 
+from cachetools import cached, LRUCache
+from cachetools.keys import hashkey
+from shelved_cache import PersistentCache
+
+pc_cache_file = "trial_diagnose_cache"
+pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096) 
+
 
 def mondo_adapter() -> OboGraphInterface:
  """
@@ -19,55 +26,67 @@ def mondo_adapter() -> OboGraphInterface:
  return get_adapter("sqlite:obo:mondo") 
 
 def mondo_mapping(term, adapter): 
- print(term)
  mondos = []
  for m in adapter.sssom_mappings([term], source="OMIM"):
  if m.predicate_id == "skos:exactMatch":
  mondos.append(m.subject_id)
  return mondos
 
+@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
 def find_category(omim_term, disease_categories, mondo):
  if not isinstance(mondo, MappingProviderInterface):
- raise ValueError("Adapter is not an MappingProviderInterface")
- # What is best algorithm to avoid traversing the mondo graph a billion times? 
+ raise ValueError("Adapter is not a MappingProviderInterface")
  # Find ancestors
  mondo_term = mondo_mapping(omim_term, mondo)
+ if not mondo_term:
+ print(omim_term)
+ return None
+
  ancestor_list = mondo.ancestors(mondo_term, predicates=[IS_A, PART_OF]) #, reflexive=True) # method=GraphTraversalMethod.ENTAILMENT
 
  for mondo_ancestor in ancestor_list:
  if mondo_ancestor in disease_categories:
  return mondo_ancestor # This should be smt like MONDO:0045024 (cancer or benign tumor)
+
+ print("Special issue following: ")
+ print(omim_term)
 
-
+#=====================================================
+# script starts here
 # Find 42 diseases categories
+#=====================================================
+
 mondo = mondo_adapter()
 disease_categories = mondo.relationships(objects = ["MONDO:0700096"], predicates=[IS_A])
+
 # make df contingency table with header=diseases_category, correct, incorrect and initialize all to 0.
 header = ["label","correct", "incorrect"]
-#header = ["diseases_category", "correct", "incorrect"]
 dc_list = [i[0] for i in list(disease_categories)]
-#contingency_table = pd.DataFrame(0, index=np.arange(len(dc_list)), columns=header)
 contingency_table = pd.DataFrame(0, index=dc_list, columns=header)
-#dc_labels = []
 for j in dc_list:
  contingency_table.loc[j,"label"] = mondo.label(j)
 
-
-# example path of full results
-filename = "testout_multmodel_b4run/raw_results/multimodel/gpt-4/full_df_results.tsv"
-
+model=str(sys.argv[1])
+filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
 # label term score rank correct_term is_correct reciprocal_rank
 # PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0
 
 df = pd.read_csv(
- filename, sep="\t" #, header=None, names=["description", "term", "label"]
+ filename, sep="\t" 
  )
 
 ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]] 
+count_fails=0
 
+omim_wo_match = {}
 for ppkt in ppkts:
  # find this phenopackets category <cat> from OMIM
  category_index = find_category(ppkt[1].iloc[0]["correct_term"], dc_list, mondo)
+ if not category_index:
+ count_fails += 1
+ #print(f"Category index for {ppkt[1].iloc[0]["correct_term"]} ")
+ omim_wo_match[ppkt[0]] = ppkt[1].iloc[0]["correct_term"]
+ continue
  #cat_ind = find_cat_index(category)
  # is there a true? ppkt is tuple ("filename", dataframe) --> ppkt[1] is a dataframe 
  if not any(ppkt[1]["is_correct"]):
@@ -77,5 +96,11 @@ def find_category(omim_term, disease_categories, mondo):
  # yes --> increase <cat> correct
  contingency_table.loc[category_index, "correct"] += 1
 
-print(contingency_table)
-
+print("\n\n", "==="*15,"\n")
+print(f"For whatever reason find_category() returned None in {count_fails} cases, wich follow:\n") # print to file!
+#print(contingency_table)
+print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(list(omim_wo_match.values())))
+
+cont_table_file = f"disease_groups/{model}.tsv"
+# Will overwrite
+#contingency_table.to_csv(cont_table_file, sep='\t')
diff --git a/src/malco/post_process/df_save_util.py b/src/malco/post_process/df_save_util.py
@@ -0,0 +1,15 @@
+import shutil
+import os
+import pandas as pd
+
+def safe_save_tsv(path, filename, df):
+ full_path = path / filename
+ # If full_path already exists, prepend "old_"
+ # It's the user's responsibility to know only up to 2 versions can exist, then data is lost
+ if os.path.isfile(full_path):
+ old_full_path = path / ("old_" + filename)
+ if os.path.isfile(old_full_path):
+ os.remove(old_full_path)
+ shutil.copy(full_path, old_full_path)
+ os.remove(full_path)
+ df.to_csv(full_path, sep='\t', index=False)
diff --git a/src/malco/post_process/generate_plots.py b/src/malco/post_process/generate_plots.py
@@ -6,7 +6,10 @@
 
 # Make a nice plot, use it as function or as script
 
-def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, comparing):
+def make_plots(mrr_file, data_dir, languages, num_ppkt, models, topn_aggr_file, comparing):
+ plot_dir = data_dir.parents[0] / "plots"
+ plot_dir.mkdir(exist_ok=True)
+
  if comparing=="model":
  name_string = str(len(models))
  else:
@@ -30,19 +33,8 @@ def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, compa
  plt.close()
 
  # Plotting bar-plots with top<n> ranks
- df = pd.read_csv(topn_file, delimiter='\t')
- df["top1"] = df['n1']
- df["top3"] = df["n1"] + df["n2"] + df["n3"]
- df["top5"] = df["top3"] + df["n4"] + df["n5"]
- df["top10"] = df["top5"] + df["n6"] + df["n7"] + df["n8"] + df["n9"] + df["n10"]
- df["not_found"] = df["nf"]
+ df_aggr = pd.read_csv(topn_aggr_file, delimiter='\t')
 
- df_aggr = pd.DataFrame()
- df_aggr = pd.melt(df, id_vars=comparing, value_vars=["top1", "top3", "top5", "top10", "not_found"], var_name="Rank_in", value_name="counts")
- df_aggr["percentage"] = df_aggr["counts"]/num_ppkt
- bar_data_file = plot_dir / "topn_aggr.tsv"
- df_aggr.to_csv(bar_data_file, sep='\t', index=False)
-
  sns.barplot(x="Rank_in", y="percentage", data = df_aggr, hue = comparing)
 
  plt.xlabel("Number of Ranks in")

diff --git a/src/malco/post_process/post_process.py b/src/malco/post_process/post_process.py
@@ -24,7 +24,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
  output_dir=output_lang, output_file_name="results.tsv")
 
  '''
-
+ #TODO should this duplicated code be merged into a single piece of code with a parameter?
  for model in models:
  raw_results_model = raw_results_dir / "multimodel" / model
  output_model = output_dir / "multimodel" / model

diff --git a/src/malco/post_process/post_process_results_format.py b/src/malco/post_process/post_process_results_format.py
@@ -2,12 +2,14 @@
 import os
 from pathlib import Path
 from typing import List
-
+import shutil
 import pandas as pd
 import yaml
 from pheval.post_processing.post_processing import PhEvalGeneResult, generate_pheval_result
 from pheval.utils.file_utils import all_files
 from pheval.utils.phenopacket_utils import GeneIdentifierUpdater, create_hgnc_dict
+from malco.post_process.df_save_util import safe_save_tsv
+
 
 
 def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
@@ -21,14 +23,15 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
  dict: Contents of the raw result file.
  """
  with open(raw_result_path, 'r') as raw_result:
- return list(yaml.safe_load_all(raw_result)) # Load and convert to list
+ return list(yaml.safe_load_all(raw_result.read().replace(u'\x04',''))) # Load and convert to list
 
 
 def create_standardised_results(raw_results_dir: Path, output_dir: Path,
  output_file_name: str) -> pd.DataFrame:
  data = []
  for raw_result_path in raw_results_dir.iterdir():
  if raw_result_path.is_file():
+ # Cannot have further files in raw_result_path!
  all_results = read_raw_result_yaml(raw_result_path)
 
  for this_result in all_results:
@@ -37,6 +40,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
  label = extracted_object.get('label')
  terms = extracted_object.get('terms')
  if terms:
+ # Note, the if allows for rerunning ppkts that failed due to connection issues
+ # We can have multiple identical ppkts/prompts in results.yaml as long as only one has a terms field
  num_terms = len(terms)
  score = [1 / (i + 1) for i in range(num_terms)] # score is reciprocal rank
  rank_list = [ i+1 for i in range(num_terms)]
@@ -47,8 +52,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
  df = pd.DataFrame(data)
 
  # Save DataFrame to TSV
- output_path = output_dir / output_file_name
- df.to_csv(output_path, sep='\t', index=False)
+ # output_path = output_dir / output_file_name
+ safe_save_tsv(output_dir, output_file_name, df)
 
  return df