Moved cache files, tested it, minor cleanups

monarch-initiative · Sep 4, 2024 · 2913dc7 · 2913dc7
1 parent 070f74c
commit 2913dc7
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 21 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,7 @@ packages = [{include = "malco", from = "src"}]
 python = "^3.10"
 pheval = "^0.3.2"
 setuptools = "^69.5.1"
+shelved-cache = "^0.3.1"
 
 
 [tool.poetry.plugins."pheval.plugins"]
@@ -20,7 +21,7 @@ pytest = "^7.1.2"
 pylint = "^2.15.6"
 pycodestyle = "^2.10.0"
 coverage = "^6.5.0"
-ontogpt = {git = "https://github.com/monarch-initiative/ontogpt.git", branch = "main"}
+ontogpt = {git = "https://github.com/monarch-initiative/ontogpt.git", tag = "v1.0.3"}
 
 [tool.poetry.group.dev.dependencies]
 tox = "^4.15.0"

diff --git a/src/malco/post_process/ranking_utils.py b/src/malco/post_process/ranking_utils.py
@@ -35,19 +35,24 @@ def mondo_adapter() -> OboGraphInterface:
  return get_adapter("sqlite:obo:mondo") 
 
 def compute_mrr_and_ranks(
- comparing, 
- output_dir, 
- prompt_dir, 
- correct_answer_file,
+ comparing: str, 
+ output_dir: Path, 
+ out_subdir: str,
+ prompt_dir: str, 
+ correct_answer_file: str,
  ) -> Path:
+
  # Read in results TSVs from self.output_dir that match glob results*tsv 
+ out_caches = output_dir / "caches"
+ out_caches.mkdir(exist_ok=True)
+ output_dir = output_dir / out_subdir
  results_data = []
  results_files = []
  num_ppkt = 0
- pc2_cache_file = str(output_dir / "score_grounded_result_cache")
- pc2 = PersistentCache(LRUCache, pc2_cache_file, maxsize=4096) 
- pc1_cache_file = str(output_dir / "omim_mappings_cache")
- pc1 = PersistentCache(LRUCache, pc1_cache_file, maxsize=16384)
+ pc2_cache_file = str(out_caches / "score_grounded_result_cache")
+ pc2 = PersistentCache(LRUCache, pc2_cache_file, maxsize=524288) 
+ pc1_cache_file = str(out_caches / "omim_mappings_cache")
+ pc1 = PersistentCache(LRUCache, pc1_cache_file, maxsize=524288)
  # Treat hits and misses as run-specific arguments, write them to cache_log
  pc1.hits = pc1.misses = 0
  pc2.hits = pc2.misses = 0
@@ -78,7 +83,7 @@ def compute_mrr_and_ranks(
  header = [comparing, "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "n10p", "nf"]
  rank_df = pd.DataFrame(0, index=np.arange(len(results_files)), columns=header)
 
- cache_file = output_dir / "cache_log.txt"
+ cache_file = out_caches / "cache_log.txt"
 
  with cache_file.open('a', newline = '') as cf:
  now_is = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -120,7 +125,7 @@ def compute_mrr_and_ranks(
  # Save full data frame
  full_df_path = output_dir / results_files[i].split("/")[0]
  full_df_filename = "full_df_results.tsv"
- safe_save_tsv(full_df_path, df, full_df_filename)
+ safe_save_tsv(full_df_path, full_df_filename, df)
 
  # Calculate MRR for this file
  mrr = df.groupby("label")["reciprocal_rank"].max().mean()

diff --git a/src/malco/runner.py b/src/malco/runner.py
@@ -23,14 +23,7 @@ class MalcoRunner(PhEvalRunner):
  #languages: tuple
  #models: tuple
  #just_run: bool
- #just_postprocess: bool
-
- #languages = ("en", "es", "nl", "it", "de")
- #models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key), 
- #models = ("gpt-3.5-turbo", "gpt-4-turbo") # Decide on list of models: Claude-Sonnet (Anthropic key), 
- #just_run = 0 # only run the run part of the code
- #just_postprocess = 1 # only run the postprocess part of the code
-
+ #just_postprocess: bool 
 
 
  def prepare(self):
@@ -75,7 +68,8 @@ def post_process(self,
  '''
  comparing = "language"
  mrr_file, plot_dir, num_ppkt, topn_aggr_file = compute_mrr_and_ranks(comparing,
- output_dir=self.output_dir / "multilingual" ,
+ output_dir=self.output_dir,
+ out_subdir="multilingual",
  prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
  correct_answer_file=correct_answer_file)
  
@@ -85,7 +79,8 @@ def post_process(self,
  '''
  comparing = "model"
  mrr_file, data_dir, num_ppkt, topn_aggr_file = compute_mrr_and_ranks(comparing,
- output_dir=self.output_dir / "multimodel" ,
+ output_dir=self.output_dir,
+ out_subdir="multimodel",
  prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
  correct_answer_file=correct_answer_file)