Fixed formatting issues and updated tests
chandralegend committed Apr 16, 2024
1 parent 7084734 commit 9841a97
Showing 16 changed files with 49 additions and 49 deletions.
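
Every hunk shown below applies the same mechanical dict-literal reformat to the Jac sources: a space is added after each colon and the stray space before the closing brace is dropped, with no change in behavior. A representative before/after pair (taken from scripts/runs2data.jac below):

- data = {"run":run, "prompt_disc":prompt_disc, "outputs":{} };
+ data = {"run": run, "prompt_disc": prompt_disc, "outputs": {}};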
scripts/backup.jac (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@ can zipdir(folders: list[str], out: str) {
}

can upload_file_to_drive(file: str, folder_id: str) {
- file_drive = drive.CreateFile({'parents':[{'id':folder_id }] });
+ file_drive = drive.CreateFile({'parents': [{'id': folder_id}]});
file_drive.SetContentFile(file);
file_drive.Upload();
}
scripts/runs2data.jac (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@ import:py argparse;

can convert_run(run: str, prompt_disc: str) {
responses_files = [f for f in os.listdir(os.path.join("runs", run)) if f.endswith(".json")];
data = {"run":run, "prompt_disc":prompt_disc, "outputs":{} };
data = {"run": run, "prompt_disc": prompt_disc, "outputs": {}};

for responses_file in responses_files {
with open(os.path.join("runs", run, responses_file), "r") as f {
src/components/emb_sim_scorer.impl.jac (4 changes: 2 additions & 2 deletions)
@@ -83,14 +83,14 @@ import:py from nltk, ngrams;
results = [];
id = 0;
for (prompt_id, question_sets) in distribution.items() {
worker_set = {"worker_id":id, "question_set_id":list(distribution.keys())[id], "evals":[], "start_time":time.time(), "end_time":0, "question_index":0 };
worker_set = {"worker_id": id, "question_set_id": list(distribution.keys())[id], "evals": [], "start_time": time.time(), "end_time": 0, "question_index": 0};
evals = [];
for (ind, question_set) in enumerate(question_sets) {
prompt_id = question_set[0];
responses = question_set[1];
anchor_reponses_id = models_responses_dict[prompt_id][st.session_state.anchor_model];
best_model = calculate_embedding_score(responses=responses, anchor_reponses_id=anchor_reponses_id, responses_dict=responses_dict);
worker_set["evals"].append({"result":{"overall":"Response A" if best_model == 0 else "Response B", 'feedback':"" }, "time":time.time(), "question":question_set });
worker_set["evals"].append({"result": {"overall": "Response A" if best_model == 0 else "Response B", 'feedback': ""}, "time": time.time(), "question": question_set});
worker_set["question_index"]+=1;
}
worker_set["end_time"] = time.time();
src/components/generator.impl.jac (2 changes: 1 addition & 1 deletion)
@@ -49,7 +49,7 @@ import:jac from utils, load_engine, run_inference, llms, check_query_engine, che
prompt_template = prompt_template_col.text_area('Input Prompt Template', placeholder='Paste your template here', height=250);
with prompt_values_col {
st.caption('Input Prompt Values');
- arguments = {x:None for x in re.findall(r'\{([A-Za-z0-9_]+)\}', prompt_template)};
+ arguments = {x: None for x in re.findall(r'\{([A-Za-z0-9_]+)\}', prompt_template)};
print(arguments);
if len(arguments) == 0 {
st.info('No arguments found in the prompt template');
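
The reformatted comprehension above collects the {placeholder} names from a prompt template and maps each one to None until the user supplies a value. A minimal Jac sketch of the same pattern, using a hypothetical template string instead of the Streamlit text area:

import:py re;

with entry {
    # hypothetical template; the app reads this from st.text_area
    prompt_template = "Summarize {text} in {n_words} words";
    arguments = {x: None for x in re.findall(r'\{([A-Za-z0-9_]+)\}', prompt_template)};
    print(arguments);  # {'text': None, 'n_words': None}
}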
src/components/human_eval.impl.jac (8 changes: 4 additions & 4 deletions)
@@ -81,7 +81,7 @@ import:py string;
}
if st.session_state.hv_config["hv_method"] == "A/B Testing" {
st.markdown("You are given two responses below, Select which one you prefer.");
result = {"overall":None, "feedback":None };
result = {"overall": None, "feedback": None};
(col_a, col_result, col_b) = st.columns(3);
with col_result {
st.caption("Questions");
@@ -96,7 +96,7 @@ import:py string;
}
if st.session_state.hv_config["hv_method"] == "A/B Testing with Criterions" {
st.caption("Questions");
result = {x["Criteria"]:None for x in st.session_state.hv_config["criterias"]};
result = {x["Criteria"]: None for x in st.session_state.hv_config["criterias"]};
result["feedback"] = None;
n_criterias = len(st.session_state.hv_config["criterias"]);
for i in range(int(n_criterias / 4) + 1) {
@@ -129,10 +129,10 @@
}
if st.button("Next Question", <>type="primary", key=f"next_{question_index}") {
if all(result.values()) {
- st.session_state.evals.append({"result":result, "time":time.time(), "question":question });
+ st.session_state.evals.append({"result": result, "time": time.time(), "question": question});
st.session_state.question_index+=1;
with open(os.path.join("results", f"{st.session_state.worker_id}_{st.session_state.question_set_id}.json"), "w") as f {
json.dump({"worker_id":st.session_state.worker_id, "question_set_id":st.session_state.question_set_id, "evals":st.session_state.evals, "start_time":st.session_state.start_time, "end_time":time.time(), "question_index":st.session_state.question_index }, f, indent=2);
json.dump({"worker_id": st.session_state.worker_id, "question_set_id": st.session_state.question_set_id, "evals": st.session_state.evals, "start_time": st.session_state.start_time, "end_time": time.time(), "question_index": st.session_state.question_index}, f, indent=2);
}
st.components.v1.html("<script>window.parent.document.querySelector('section.main').scrollTo(-1, -1);</script>", height=0);
time.sleep(0.5);
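
For reference, the json.dump call above writes a per-worker progress file to results/<worker_id>_<question_set_id>.json. Under the plain A/B Testing method it has roughly this shape (all values below are illustrative, not taken from a real run; the "result" and "question" fields vary with the evaluation method):

{
  "worker_id": "worker-uuid",
  "question_set_id": "question-set-uuid",
  "evals": [
    {
      "result": {"overall": "Response A", "feedback": ""},
      "time": 1713250000.0,
      "question": ["prompt-id", {"model_a": "response-id-a", "model_b": "response-id-b"}]
    }
  ],
  "start_time": 1713249000.0,
  "end_time": 1713250100.0,
  "question_index": 1
}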
src/components/llm_as_evaluator.impl.jac (10 changes: 5 additions & 5 deletions)
@@ -64,17 +64,17 @@ import:jac from utils, llms, check_engine_status, load_engine, run_inference;
(model_b, model_b_respones_id) = list(question[1].items())[1];
model_a_respones = st.session_state.responses[model_a_respones_id];
model_b_respones = st.session_state.responses[model_b_respones_id];
arguments = {"usecase":prompt_disc, "response_a":model_a_respones, "response_b":model_b_respones };
arguments = {"usecase": prompt_disc, "response_a": model_a_respones, "response_b": model_b_respones};
try {
inference_result = run_inference(model_name, 1, arguments);
question_result = process_inference_result(inference_result['outputs'][0]['response']);
evals.append({"result":question_result, "time":time.time(), "question":question, "inference_result":inference_result });
evals.append({"result": question_result, "time": time.time(), "question": question, "inference_result": inference_result});
} except Exception as e {
evals.append({"result":str(e), "time":time.time(), "question":question });
evals.append({"result": str(e), "time": time.time(), "question": question});
}
}
with open(os.path.join("results", f"llm_as_evaluator_{question_set_id}.json"), "w") as f {
json.dump({"worker_id":"llm_as_evaluator", "question_set_id":question_set_id, "evals":evals, "start_time":time.time(), "end_time":time.time() }, f, indent=2);
json.dump({"worker_id": "llm_as_evaluator", "question_set_id": question_set_id, "evals": evals, "start_time": time.time(), "end_time": time.time()}, f, indent=2);
}
}
} else {
@@ -90,7 +90,7 @@ import:jac from utils, llms, check_engine_status, load_engine, run_inference;
answers = eval(re.search(r"\[Output\](.*)", inference_result, re.DOTALL).group(1).strip());
answers["feedback"] = re.search(r"\[Reasoning\](.*)\[Output\]", inference_result, re.DOTALL).group(1).strip();
} except Exception as e {
answers = {"error":str(e) };
answers = {"error": str(e)};
}
return answers;
}
src/components/model_ranking.jac (4 changes: 2 additions & 2 deletions)
@@ -5,7 +5,7 @@ import:jac from utils, map_prompt_names_to_ids, format_responses_by_prompt;
can model_win_percentage_table {
if st.session_state.get("current_hv_config", None) {
criteria = ["overall"];
- model_performance = {model:{"wins":0, "total":0 } for model in st.session_state.active_list_of_models};
+ model_performance = {model: {"wins": 0, "total": 0} for model in st.session_state.active_list_of_models};
formatted_data = format_responses_by_prompt(st.session_state.workers_data_dir, st.session_state.distribution_file, st.session_state.response_file);
prompt_info = map_prompt_names_to_ids(st.session_state.prompt_data_dir, st.session_state.prompt_info_file);
prompt_ids = list(prompt_info.keys());
@@ -34,7 +34,7 @@ can model_win_percentage_table {
data = [];
for (model, counts) in model_performance.items() {
win_percentage = (counts["wins"] / counts["total"]) if counts["total"] else 0;
data.append({"Model":model, "Win Percentage":win_percentage });
data.append({"Model": model, "Win Percentage": win_percentage});
}
df = pd.DataFrame(data);
df = df.sort_values(by='Win Percentage', ascending=False);
src/components/plot_utils.jac (6 changes: 3 additions & 3 deletions)
@@ -11,7 +11,7 @@ can generate_stacked_bar_chart(model_performance: dict, criteria: list) {

for (model, crits) in model_performance.items() {
for (crit, counts) in crits.items() {
df_data.append({"model":model, "criterion":crit, "wins":counts["wins"], "ties":counts["ties"], "losses":counts["losses"] });
df_data.append({"model": model, "criterion": crit, "wins": counts["wins"], "ties": counts["ties"], "losses": counts["losses"]});
}
}
df = pd.DataFrame(df_data);
@@ -22,7 +22,7 @@ can generate_stacked_bar_chart(model_performance: dict, criteria: list) {
fig_height_per_row = 220;
}
total_fig_height = fig_height_per_row * len(criteria);
colors = {"wins":"green", "ties":"orange", "losses":"red" };
colors = {"wins": "green", "ties": "orange", "losses": "red"};

for (i, criterion) in enumerate(criteria) {
criterion_data = df[df["criterion"] == criterion].sort_values("wins", ascending=False);
@@ -43,7 +43,7 @@ can generate_heatmaps(placeholder: str, model_performance: dict, preference_matr

for (model, crit_dict) in model_performance.items() {
for (crit, win_tie_loss) in crit_dict.items() {
- rows_list.append({"model":model, "criterion":crit, "wins":win_tie_loss['wins'], "ties":win_tie_loss['ties'], "losses":win_tie_loss['losses'] });
+ rows_list.append({"model": model, "criterion": crit, "wins": win_tie_loss['wins'], "ties": win_tie_loss['ties'], "losses": win_tie_loss['losses']});
}
}
df = pd.DataFrame(rows_list);
src/components/setup.impl.jac (14 changes: 7 additions & 7 deletions)
@@ -12,22 +12,22 @@ import:py pandas as pd;
if os.path.exists(os.path.join(".human_eval_config", "config.json")) {
config = json.load(open(os.path.join(".human_eval_config", "config.json")));
} else {
config = {"hv_method":"A/B Testing", "config":{"n_workers":100, "n_questions_per_worker":10, "n_options":2, "ability_to_tie":"Allow", "data_sources":None, "evenly_distributed":False, "show_captcha":True, "completion_code":str(uuid.uuid4()) } };
config = {"hv_method": "A/B Testing", "config": {"n_workers": 100, "n_questions_per_worker": 10, "n_options": 2, "ability_to_tie": "Allow", "data_sources": None, "evenly_distributed": False, "show_captcha": True, "completion_code": str(uuid.uuid4())}};
}
st.session_state.config = config;
}
}

:can:hv_evaluation_method_selector {
eval_methods = {"A/B Testing":{"desc":"A/B Testing is a method where the user is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image":"https://i.ibb.co/kXwypbc/image.png" }, "A/B Testing with Criterions":{"desc":"A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image":"https://i.ibb.co/JF6Q3F3/image.png" } };
eval_methods = {"A/B Testing": {"desc": "A/B Testing is a method where the user is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image": "https://i.ibb.co/kXwypbc/image.png"}, "A/B Testing with Criterions": {"desc": "A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image": "https://i.ibb.co/JF6Q3F3/image.png"}};
criterias = None;
st.subheader("Human Evaluation Method");
(hv_method_col, hv_method_view_col) = st.columns(2);
with hv_method_col {
hv_method = st.selectbox("Select the human evaluation method", list(eval_methods.keys()), index=list(eval_methods.keys()).index(st.session_state.config["hv_method"]));
st.caption(eval_methods[hv_method]["desc"]);
if hv_method == "A/B Testing with Criterions" {
criteria_df = pd.DataFrame([{"Criteria":"clarity", "Description":"Which has better clarity?" }, {"Criteria":"intelligence", "Description":"Which is more intelligent?" }, {"Criteria":"likability", "Description":"Which is more likable?" }, {"Criteria":"trustworthy", "Description":"Which is more trustworthy?" }, {"Criteria":"overall", "Description":"Which is better overall?" }]);
criteria_df = pd.DataFrame([{"Criteria": "clarity", "Description": "Which has better clarity?"}, {"Criteria": "intelligence", "Description": "Which is more intelligent?"}, {"Criteria": "likability", "Description": "Which is more likable?"}, {"Criteria": "trustworthy", "Description": "Which is more trustworthy?"}, {"Criteria": "overall", "Description": "Which is better overall?"}]);
edited_df = st.data_editor(criteria_df, num_rows="dynamic");
criterias = edited_df.to_dict(orient="records");
}
@@ -116,7 +116,7 @@ import:py pandas as pd;
unique_question_pairs = list(itertools.product(model_a_responses, model_b_responses));
unique_question_pairs = random.sample(unique_question_pairs, k);
for question_pair in unique_question_pairs {
- question_pairs.append((prompt_id, {model_pair[0]:question_pair[0], model_pair[1]:question_pair[1] }));
+ question_pairs.append((prompt_id, {model_pair[0]: question_pair[0], model_pair[1]: question_pair[1]}));
}
}
}
@@ -133,12 +133,12 @@ import:py pandas as pd;
selected_question_pairs = random.sample(question_pairs, n_workers * n_questions_per_worker);
random.shuffle(selected_question_pairs);
worker_ids = [str(uuid.uuid4()) for _ in range(n_workers)];
- distribution = {worker_id:selected_question_pairs[i * n_questions_per_worker:(i + 1) * n_questions_per_worker] for (i, worker_id) in enumerate(worker_ids)};
+ distribution = {worker_id: selected_question_pairs[i * n_questions_per_worker:(i + 1) * n_questions_per_worker] for (i, worker_id) in enumerate(worker_ids)};
} else {
question_pairs_copy = question_pairs.copy();
random.shuffle(question_pairs_copy);
worker_ids = [str(uuid.uuid4()) for _ in range(n_workers)];
- distribution = {worker_id:[] for worker_id in worker_ids};
+ distribution = {worker_id: [] for worker_id in worker_ids};
prompt_infos = json.load(open(os.path.join(".human_eval_config", "prompt_info.json")));
prompt_ids = [prompt_info["prompt_id"] for prompt_info in prompt_infos];
questions_per_usecase_per_worker = int(n_questions_per_worker / len(prompt_ids));
@@ -201,7 +201,7 @@ import:py pandas as pd;
:can:get_prompt_info
(data_sources: list) {
st.subheader("Prompt Information");
- prompt_infos = {data_source:{} for data_source in data_sources};
+ prompt_infos = {data_source: {} for data_source in data_sources};
with st.container(border=True) {
for data_source in data_sources {
with st.expander(f"Usecase {data_source}") {
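
The distribution comprehension reformatted above hands each worker a contiguous slice of the shuffled question pairs. A minimal Jac sketch of that slicing, with hypothetical sizes (3 workers, 2 questions per worker) and stand-in pair labels instead of real (prompt_id, responses) tuples:

import:py uuid;

with entry {
    n_workers = 3;
    n_questions_per_worker = 2;
    # stand-in question pairs; in the app these come from the sampled model-response pairs
    selected_question_pairs = [f"pair_{i}" for i in range(n_workers * n_questions_per_worker)];
    worker_ids = [str(uuid.uuid4()) for _ in range(n_workers)];
    distribution = {worker_id: selected_question_pairs[i * n_questions_per_worker:(i + 1) * n_questions_per_worker] for (i, worker_id) in enumerate(worker_ids)};
    print(distribution);  # worker 0 gets pair_0 and pair_1, worker 1 gets pair_2 and pair_3, worker 2 gets pair_4 and pair_5
}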
src/components/setup.jac (2 changes: 1 addition & 1 deletion)
@@ -37,7 +37,7 @@ can setup {
prompt_infos = get_prompt_info(data_sources);
if st.button("Save") {
os.makedirs(".human_eval_config", exist_ok=True);
config = {"hv_method":hv_method, "config":{"n_workers":n_workers, "n_questions_per_worker":n_questions_per_worker, "n_options":n_options, "ability_to_tie":ability_to_tie, "data_sources":data_sources, "evenly_distributed":evenly_distributed, "show_captcha":show_captcha, "completion_code":str(uuid.uuid4()) }, "criterias":criterias, "time_created":datetime.now().strftime("%d/%m/%Y %H:%M:%S") };
config = {"hv_method": hv_method, "config": {"n_workers": n_workers, "n_questions_per_worker": n_questions_per_worker, "n_options": n_options, "ability_to_tie": ability_to_tie, "data_sources": data_sources, "evenly_distributed": evenly_distributed, "show_captcha": show_captcha, "completion_code": str(uuid.uuid4())}, "criterias": criterias, "time_created": datetime.now().strftime("%d/%m/%Y %H:%M:%S")};
st.session_state.config = config;
json.dump(config, open(os.path.join(".human_eval_config", "config.json"), "w"));
models_responses_all = create_neccessary_file(data_sources, prompt_infos);