utils.py

import json

import pandas as pd


def remove_nul_chars_from_string(s):
"""Remove NUL characters from a single string."""
return s.replace('\x00', '')


def remove_nul_chars_from_run_data(run_data):
"""Iterate over all fields of RunData to remove NUL characters."""
for run in run_data:
run.reference_question = remove_nul_chars_from_string(run.reference_question)
run.reference_answer = remove_nul_chars_from_string(run.reference_answer)
run.llm_answer = remove_nul_chars_from_string(run.llm_answer)
run.llm_context = [remove_nul_chars_from_string(context) for context in run.llm_context]


def make_get_llama_response(query_engine):
def get_llama_response(prompt):
response = query_engine.query(prompt)
context = []
for x in response.source_nodes:
# Initialize context string with the text of the node
node_context = x.text
# Check if 'window' metadata exists and append it to the context
if 'window' in x.metadata:
node_context += "\n\nWindow Context:\n" + x.metadata['window']
context.append(node_context)
return {
"llm_answer": response.response,
"llm_context_list": context
}
return get_llama_response
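
# Illustrative usage of make_get_llama_response (a sketch assuming a LlamaIndex
# index has already been built; `index` and the question text are placeholders):
#     query_engine = index.as_query_engine(similarity_top_k=3)
#     get_response = make_get_llama_response(query_engine)
#     result = get_response("What is retrieval-augmented generation?")
#     print(result["llm_answer"], result["llm_context_list"])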


def chunked_iterable(iterable, size):
    """Yield successive chunks of the given size from iterable."""
for i in range(0, len(iterable), size):
yield iterable[i:i + size]
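
# Example: list(chunked_iterable([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]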


def load_config(config_path):
    """Load configuration settings from a JSON file and validate required keys."""
with open(config_path) as config_file:
config = json.load(config_file)
# Example validation, extend as needed
required_keys = ['openai_api_key', 'qdrant_url', 'qdrant_api_key']
if not all(key in config for key in required_keys):
raise ValueError("Config file is missing required keys.")
return config
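
# Illustrative config.json shape, based on the required keys above
# (all values are placeholders):
# {
#     "openai_api_key": "sk-...",
#     "qdrant_url": "https://...",
#     "qdrant_api_key": "..."
# }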


def run_experiment(experiment_name, query_engine, scorer, benchmark,
                   validate_api, project_key, upload_results=True, runs=5):
    """Score `benchmark` against `query_engine` `runs` times, optionally uploading each run, and return the overall scores as a DataFrame."""
    results_list = []  # one results dictionary per run
    get_llama_response_func = make_get_llama_response(query_engine)
    for i in range(runs):
run = scorer.score(benchmark,
get_llama_response_func,
callback_parallelism=1,
scoring_parallelism=1)
print(f"{experiment_name} Run {i+1} Overall Scores:", run.overall_scores)
remove_nul_chars_from_run_data(run.run_data)
# Add this run's results to the list
results_list.append({'Run': i+1, 'Experiment': experiment_name, 'OverallScores': run.overall_scores})
if upload_results:
validate_api.upload_run(project_key, run=run, run_metadata={"approach": experiment_name, "run_number": i+1})
else:
print(f"Skipping upload for {experiment_name} Run {i+1}.")
# Create a DataFrame from the list of results dictionaries
results_df = pd.DataFrame(results_list)
# Return the DataFrame containing all the results
return results_df
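
# Illustrative call (a sketch assuming a Tonic Validate-style scorer, benchmark,
# and API client are already set up; all argument values below are placeholders):
#     results_df = run_experiment("sentence_window", query_engine, scorer,
#                                 benchmark, validate_api, project_key,
#                                 upload_results=False, runs=3)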


def filter_large_nodes(nodes, max_length=8000):
"""
Filters out nodes with 'window' or 'text' length greater than max_length.
Needed bcs sometimes the sentences are too long due to tables or refereneces in data.
It creates one giga long non-sensical sentence. Before filtering please do analysis
so that you dont throw out anything important.
Args:
- nodes (list): List of node objects.
- max_length (int): Maximum allowed length for 'window' and 'text'.
Returns:
- list: Filtered list of nodes.
"""
filtered_nodes = []
for node in nodes:
text_length = len(node.text)
window_length = len(node.metadata.get('window', ''))
if text_length <= max_length and window_length <= max_length:
filtered_nodes.append(node)
return filtered_nodes
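
# Illustrative usage (keeps only nodes whose text and window fit within the limit):
#     nodes = filter_large_nodes(nodes, max_length=8000)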