Skip to content

Commit

Permalink
Persist cache (#44)
Browse files Browse the repository at this point in the history
* fixed most issues

* finished persisting cache
  • Loading branch information
leokim-l committed Aug 15, 2024
1 parent 7c03bd2 commit 5c3ca68
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 41 deletions.
1 change: 0 additions & 1 deletion src/malco/post_process/generate_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, compa
plt.xlabel("Number of Ranks in")
plt.ylabel("Percentage of Cases")
plt.title("Rank Comparison for Differential Diagnosis")
breakpoint()
plt.legend(title=comparing)
plot_path = plot_dir / ("barplot_" + name_string + "_" + comparing + "_" + str(num_ppkt) + "ppkt.png")
plt.savefig(plot_path)
Expand Down
33 changes: 26 additions & 7 deletions src/malco/post_process/mondo_score_utils.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from oaklib.datamodels.vocabulary import IS_A
from oaklib.interfaces import MappingProviderInterface
from pathlib import Path

from typing import List
from cachetools import cached, LRUCache
from cachetools.keys import hashkey



FULL_SCORE = 1.0
PARTIAL_SCORE = 0.5

@cached(cache=LRUCache(maxsize=16384), info=True, key=lambda term, adapter: hashkey(term))

def omim_mappings(term: str, adapter) -> List[str]:
"""
Get the OMIM mappings for a term.
Expand All @@ -35,8 +34,7 @@ def omim_mappings(term: str, adapter) -> List[str]:
return omims


@cached(cache=LRUCache(maxsize=4096), info=True, key=lambda prediction, ground_truth, mondo: hashkey(prediction, ground_truth))
def score_grounded_result(prediction: str, ground_truth: str, mondo) -> float:
def score_grounded_result(prediction: str, ground_truth: str, mondo, cache=None) -> float:
"""
Score the grounded result.
Expand Down Expand Up @@ -72,15 +70,36 @@ def score_grounded_result(prediction: str, ground_truth: str, mondo) -> float:
# prediction is the correct OMIM
return FULL_SCORE

if ground_truth in omim_mappings(prediction, mondo):

ground_truths = get_ground_truth_from_cache_or_compute(prediction, mondo, cache)
if ground_truth in ground_truths:
# prediction is a MONDO that directly maps to a correct OMIM
return FULL_SCORE

descendants_list = mondo.descendants([prediction], predicates=[IS_A], reflexive=True)
for mondo_descendant in descendants_list:
if ground_truth in omim_mappings(mondo_descendant, mondo):
ground_truths = get_ground_truth_from_cache_or_compute(mondo_descendant, mondo, cache)
if ground_truth in ground_truths:
# prediction is a MONDO that maps to a correct OMIM via a descendant
return PARTIAL_SCORE
return 0.0

def get_ground_truth_from_cache_or_compute(
    term,
    adapter,
    cache,
):
    """Return the OMIM mappings for *term*, consulting *cache* first.

    When *cache* is ``None`` the mappings are always computed via
    ``omim_mappings``. Otherwise the cache is keyed by ``hashkey(term)``
    and its ``hits``/``misses`` counters are updated accordingly.
    # NOTE(review): assumes *cache* carries mutable ``hits``/``misses``
    # attributes (set up by the caller) — confirm against call sites.
    """
    if cache is None:
        return omim_mappings(term, adapter)

    key = hashkey(term)
    if key in cache:
        # cache hit
        cache.hits += 1
        return cache[key]

    # cache miss: compute, store, and count it
    mappings = omim_mappings(term, adapter)
    cache[key] = mappings
    cache.misses += 1
    return mappings

3 changes: 3 additions & 0 deletions src/malco/post_process/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
output_dir (Path): Path to the output directory.
"""

'''
for lang in langs:
raw_results_lang = raw_results_dir / "multilingual" / lang
output_lang = output_dir / "multilingual" / lang
Expand All @@ -22,6 +23,8 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
create_standardised_results(raw_results_dir=raw_results_lang,
output_dir=output_lang, output_file_name="results.tsv")
'''

for model in models:
raw_results_model = raw_results_dir / "multimodel" / model
output_model = output_dir / "multimodel" / model
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,23 @@
import pandas as pd
import numpy as np
import pickle as pkl
from malco.post_process.mondo_score_utils import score_grounded_result
from malco.post_process.mondo_score_utils import omim_mappings
from typing import List

from oaklib.interfaces import OboGraphInterface
from oaklib.datamodels.vocabulary import IS_A
from oaklib.interfaces import MappingProviderInterface
from oaklib import get_adapter

from malco.post_process.mondo_score_utils import score_grounded_result
from cachetools import LRUCache
from typing import List
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

from oaklib import get_adapter
FULL_SCORE = 1.0
PARTIAL_SCORE = 0.5

def cache_info(self):
    """Format cache statistics as a human-readable one-line string.

    Intended to be attached to ``PersistentCache`` as a method; reads the
    ``hits``/``misses`` counters set by the caller and the size figures of
    the wrapped in-memory cache.
    """
    return (
        "CacheInfo: "
        f"hits={self.hits}, "
        f"misses={self.misses}, "
        f"maxsize={self.wrapped.maxsize}, "
        f"currsize={self.wrapped.currsize}"
    )

def mondo_adapter() -> OboGraphInterface:
"""
Expand All @@ -23,14 +32,27 @@ def mondo_adapter() -> OboGraphInterface:
"""
return get_adapter("sqlite:obo:mondo")

def compute_mrr(comparing, output_dir, prompt_dir, correct_answer_file,
raw_results_dir) -> Path:
def compute_mrr_and_ranks(
comparing,
output_dir,
prompt_dir,
correct_answer_file,
raw_results_dir,
) -> Path:
# Read in results TSVs from self.output_dir that match glob results*tsv
results_data = []
results_files = []
num_ppkt = 0
pc2_cache_file = str(output_dir / "score_grounded_result_cache")
pc2 = PersistentCache(LRUCache, pc2_cache_file, maxsize=4096)
pc1_cache_file = str(output_dir / "omim_mappings_cache")
pc1 = PersistentCache(LRUCache, pc1_cache_file, maxsize=16384)
pc1.hits = pc1.misses = 0
pc2.hits = pc2.misses = 0
PersistentCache.cache_info = cache_info


for subdir, dirs, files in os.walk(output_dir): # maybe change this so it only looks into multilingual/multimodel? I.e. use that as outputdir...?
for subdir, dirs, files in os.walk(output_dir):
for filename in files:
if filename.startswith("result") and filename.endswith(".tsv"):
file_path = os.path.join(subdir, filename)
Expand All @@ -54,7 +76,7 @@ def compute_mrr(comparing, output_dir, prompt_dir, correct_answer_file,

cache_file = output_dir / "cache_log.txt"

with cache_file.open('w', newline = '') as cf:
with cache_file.open('a', newline = '') as cf:
now_is = datetime.now().strftime("%Y%m%d-%H%M%S")
cf.write("Timestamp: " + now_is +"\n\n")
mondo = mondo_adapter()
Expand All @@ -74,7 +96,17 @@ def compute_mrr(comparing, output_dir, prompt_dir, correct_answer_file,
# Make sure caching is used in the following by unwrapping explicitly
results = []
for idx, row in df.iterrows():
val = score_grounded_result(row['term'], row['correct_term'], mondo)

# lambda prediction, ground_truth, mondo: hashkey(prediction, ground_truth)
k = hashkey(row['term'], row['correct_term'])
try:
val = pc2[k]
pc2.hits += 1
except KeyError:
# cache miss
val = score_grounded_result(row['term'], row['correct_term'], mondo, pc1)
pc2[k] = val
pc2.misses += 1
is_correct = val > 0
results.append(is_correct)

Expand All @@ -97,34 +129,35 @@ def compute_mrr(comparing, output_dir, prompt_dir, correct_answer_file,

ppkts = df.groupby("label")[["rank","is_correct"]]
index_matches = df.index[df['is_correct']]

# for each group
for ppkt in ppkts:
# is there a true? ppkt is tuple ("filename", dataframe) --> ppkt[1] is a dataframe
if not any(ppkt[1]["is_correct"]):
# no --> increase nf = "not found"
rank_df.loc[i,"nf"] += 1
else:
# yes --> what's it rank? It's <j>
jind = ppkt[1].index[ppkt[1]['is_correct']]
j = int(ppkt[1]['rank'].loc[jind].values[0])
if j<11:
# increase n<j>
rank_df.loc[i,"n"+str(j)] += 1
else:
# increase n10p
rank_df.loc[i,"n10p"] += 1

# yes --> what's it rank? It's <j>
jind = ppkt[1].index[ppkt[1]['is_correct']]
j = int(ppkt[1]['rank'].loc[jind].values[0])
if j<11:
# increase n<j>
rank_df.loc[i,"n"+str(j)] += 1
else:
# increase n10p
rank_df.loc[i,"n10p"] += 1

# Write cache characteristics to file
cf.write(results_files[i])
cf.write('\nscore_grounded_result cache info:\n')
cf.write(str(score_grounded_result.cache_info()))
cf.write(str(pc2.cache_info()))
cf.write('\nomim_mappings cache info:\n')
cf.write(str(omim_mappings.cache_info()))
cf.write(str(pc1.cache_info()))
cf.write('\n\n')
i = i + 1

pc1.close()
pc2.close()

plot_dir = output_dir / "plots"
plot_dir.mkdir(exist_ok=True)
Expand Down
22 changes: 18 additions & 4 deletions src/malco/run/run.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,33 @@
from pathlib import Path
import multiprocessing
import subprocess
from malco.run import search_ppkts


def call_ontogpt(lang, raw_results_dir, input_dir, model, modality):
def call_ontogpt(lang, raw_results_dir, input_dir, model, modality):
# TODO
# Check what ppkts have already been computed in current output dir, for current run parameters
# ontogpt will run every txt that is in inputdir, we need a tmp inputdir
# This tmp inputdir contains only the prompts that have not yet been computed for a given, fixed model (pars set)
# If it exists and is not empty, create a list of what is in {raw_results_dir}/{lang}/differentials_by_file/
# The file names are identical to the prompt file names, with an extra ".result"
# Copy all prompt files in the new tmp inputdir, except the ones of line above

if modality=="several_languages":
#selected_indir = search_ppkts(input_dir, raw_results_dir, lang)
command = (
f"ontogpt -v run-multilingual-analysis "
f"--output={raw_results_dir}/{lang}/results.yaml " # save raw OntoGPT output
#f"{selected_indir} "
f"{input_dir}/prompts/{lang}/ "
f"{raw_results_dir}/{lang}/differentials_by_file/ "
f"--model={model}"
)
elif modality=="several_models":
#selected_indir = search_ppkts(input_dir, raw_results_dir, model)
command = (
f"ontogpt -v run-multilingual-analysis "
f"--output={raw_results_dir}/{model}/results.yaml " # save raw OntoGPT output
#f"{selected_indir} "
f"{input_dir}/prompts/{lang}/ "
f"{raw_results_dir}/{model}/differentials_by_file/ "
f"--model={model}"
Expand All @@ -27,7 +39,7 @@ def call_ontogpt(lang, raw_results_dir, input_dir, model, modality):
process.communicate()
print(f"Finished command for language {lang} and model {model}")


#TODO decide whether to get rid of parallelization
def run(testdata_dir: Path,
raw_results_dir: Path,
input_dir: Path,
Expand All @@ -48,10 +60,12 @@ def run(testdata_dir: Path,
if max_workers is None:
max_workers = multiprocessing.cpu_count()

'''
modality = "several_languages"
with multiprocessing.Pool(processes=max_workers) as pool:
pool.starmap(call_ontogpt, [(lang, raw_results_dir / "multilingual", input_dir, "gpt-4-turbo", modality) for lang in langs])

'''

# English only many models
modality = "several_models"
with multiprocessing.Pool(processes=max_workers) as pool:
Expand Down
33 changes: 33 additions & 0 deletions src/malco/run/search_ppkts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os

def search_ppkts(input_dir, raw_results_dir, lang_or_model):
    """Select the prompt files that still need to be run through ontogpt.

    Compares the prompts in ``{input_dir}/prompts/{lang_or_model}/`` against
    the results already present in
    ``{raw_results_dir}/{lang_or_model}/differentials_by_file/`` (result file
    names are the prompt names with a ``.result`` suffix).

    Returns:
        The original prompt directory when no results exist yet, otherwise
        the path of a ``tmp/`` subdirectory containing copies of only the
        prompts that have no result yet.
    """
    import shutil  # local import: only needed when copying leftover prompts

    original_inputdir = f"{input_dir}/prompts/{lang_or_model}/"
    diff_dir = f"{raw_results_dir}/{lang_or_model}/differentials_by_file/"

    # Top-level result file names only (break after the first os.walk level).
    files = []
    for dirpath, dirnames, filenames in os.walk(diff_dir):
        files.extend(filenames)
        break

    # No results at all: everything still needs to run, use the full dir.
    if not files:
        return original_inputdir

    selected_indir = original_inputdir + "tmp/"
    # BUGFIX: was os.makedir (does not exist); also tolerate reruns.
    os.makedirs(selected_indir, exist_ok=True)

    # Top-level prompt file names (tmp/ is a directory, so it is excluded).
    promptfiles = []
    for dirpath, dirnames, filenames in os.walk(original_inputdir):
        promptfiles.extend(filenames)
        break

    # Copy every prompt that does not yet have a matching ".result" file.
    for promptfile in promptfiles:
        if promptfile + ".result" in files:
            continue
        # BUGFIX: was os.copy (does not exist) with a bare relative filename;
        # copy from the prompt directory explicitly so CWD does not matter.
        shutil.copy(os.path.join(original_inputdir, promptfile), selected_indir)
    return selected_indir
15 changes: 8 additions & 7 deletions src/malco/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@

from pheval.runners.runner import PhEvalRunner

from malco.post_process.compute_mrr import compute_mrr
from malco.post_process.ranking_utils import compute_mrr_and_ranks
from malco.post_process.post_process import post_process
from malco.run.run import run
from malco.prepare.setup_phenopackets import setup_phenopackets
from malco.post_process.generate_plots import make_plots
import os

@dataclass
@dataclass # (N) if PhevalRunner is already one?
class MalcoRunner(PhEvalRunner):
input_dir: Path
testdata_dir: Path
Expand All @@ -21,7 +21,8 @@ class MalcoRunner(PhEvalRunner):
# Declare a tuple of languages and models
#TODO move next 4 lines to input file
languages = ("en", "es", "nl", "it", "de")
models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key),
#models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key),
models = ("gpt-3.5-turbo", "gpt-4-turbo") # Decide on list of models: Claude-Sonnet (Anthropic key),
just_run = 1 # only run the run part of the code
just_postprocess = 0 # only run the postprocess part of the code

Expand Down Expand Up @@ -61,9 +62,9 @@ def post_process(self,
langs=self.languages,
models=self.models)


'''
comparing = "language"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(comparing,
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr_and_ranks(comparing,
output_dir=self.output_dir / "multilingual" ,
prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
correct_answer_file=correct_answer_file,
Expand All @@ -72,9 +73,9 @@ def post_process(self,
if print_plot:
make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)

'''
comparing = "model"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr( comparing,
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr_and_ranks(comparing,
output_dir=self.output_dir / "multimodel" ,
prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
correct_answer_file=correct_answer_file,
Expand Down

0 comments on commit 5c3ca68

Please sign in to comment.