Fixed formatting issues and updated tests
chandralegend committed Apr 16, 2024
1 parent 7084734 commit 9841a97
Showing 16 changed files with 49 additions and 49 deletions.
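
Every hunk shown below applies the same mechanical dict-literal reformat to the Jac sources: a space is added after each colon and the stray space before the closing brace is dropped, with no change in behavior. A representative before/after pair (taken from scripts/runs2data.jac below):

- data = {"run":run, "prompt_disc":prompt_disc, "outputs":{} };
+ data = {"run": run, "prompt_disc": prompt_disc, "outputs": {}};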
scripts/backup.jac (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@ can zipdir(folders: list[str], out: str) {
}

can upload_file_to_drive(file: str, folder_id: str) {
- file_drive = drive.CreateFile({'parents':[{'id':folder_id }] });
+ file_drive = drive.CreateFile({'parents': [{'id': folder_id}]});
file_drive.SetContentFile(file);
file_drive.Upload();
}
scripts/runs2data.jac (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@ import:py argparse;

can convert_run(run: str, prompt_disc: str) {
responses_files = [f for f in os.listdir(os.path.join("runs", run)) if f.endswith(".json")];
data = {"run":run, "prompt_disc":prompt_disc, "outputs":{} };
data = {"run": run, "prompt_disc": prompt_disc, "outputs": {}};

for responses_file in responses_files {
with open(os.path.join("runs", run, responses_file), "r") as f {
src/components/emb_sim_scorer.impl.jac (4 changes: 2 additions & 2 deletions)
@@ -83,14 +83,14 @@ import:py from nltk, ngrams;
results = [];
id = 0;
for (prompt_id, question_sets) in distribution.items() {
worker_set = {"worker_id":id, "question_set_id":list(distribution.keys())[id], "evals":[], "start_time":time.time(), "end_time":0, "question_index":0 };
worker_set = {"worker_id": id, "question_set_id": list(distribution.keys())[id], "evals": [], "start_time": time.time(), "end_time": 0, "question_index": 0};
evals = [];
for (ind, question_set) in enumerate(question_sets) {
prompt_id = question_set[0];
responses = question_set[1];
anchor_reponses_id = models_responses_dict[prompt_id][st.session_state.anchor_model];
best_model = calculate_embedding_score(responses=responses, anchor_reponses_id=anchor_reponses_id, responses_dict=responses_dict);
worker_set["evals"].append({"result":{"overall":"Response A" if best_model == 0 else "Response B", 'feedback':"" }, "time":time.time(), "question":question_set });
worker_set["evals"].append({"result": {"overall": "Response A" if best_model == 0 else "Response B", 'feedback': ""}, "time": time.time(), "question": question_set});
worker_set["question_index"]+=1;
}
worker_set["end_time"] = time.time();
src/components/generator.impl.jac (2 changes: 1 addition & 1 deletion)
@@ -49,7 +49,7 @@ import:jac from utils, load_engine, run_inference, llms, check_query_engine, che
prompt_template = prompt_template_col.text_area('Input Prompt Template', placeholder='Paste your template here', height=250);
with prompt_values_col {
st.caption('Input Prompt Values');
- arguments = {x:None for x in re.findall(r'\{([A-Za-z0-9_]+)\}', prompt_template)};
+ arguments = {x: None for x in re.findall(r'\{([A-Za-z0-9_]+)\}', prompt_template)};
print(arguments);
if len(arguments) == 0 {
st.info('No arguments found in the prompt template');
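
The reformatted comprehension above collects the {placeholder} names from a prompt template and maps each one to None until the user supplies a value. A minimal Jac sketch of the same pattern, using a hypothetical template string instead of the Streamlit text area:

import:py re;

with entry {
    # hypothetical template; the app reads this from st.text_area
    prompt_template = "Summarize {text} in {n_words} words";
    arguments = {x: None for x in re.findall(r'\{([A-Za-z0-9_]+)\}', prompt_template)};
    print(arguments);  # {'text': None, 'n_words': None}
}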
src/components/human_eval.impl.jac (8 changes: 4 additions & 4 deletions)
@@ -81,7 +81,7 @@ import:py string;
}
if st.session_state.hv_config["hv_method"] == "A/B Testing" {
st.markdown("You are given two responses below, Select which one you prefer.");
result = {"overall":None, "feedback":None };
result = {"overall": None, "feedback": None};
(col_a, col_result, col_b) = st.columns(3);
with col_result {
st.caption("Questions");
@@ -96,7 +96,7 @@ import:py string;
}
if st.session_state.hv_config["hv_method"] == "A/B Testing with Criterions" {
st.caption("Questions");
result = {x["Criteria"]:None for x in st.session_state.hv_config["criterias"]};
result = {x["Criteria"]: None for x in st.session_state.hv_config["criterias"]};
result["feedback"] = None;
n_criterias = len(st.session_state.hv_config["criterias"]);
for i in range(int(n_criterias / 4) + 1) {
@@ -129,10 +129,10 @@
}
if st.button("Next Question", <>type="primary", key=f"next_{question_index}") {
if all(result.values()) {
- st.session_state.evals.append({"result":result, "time":time.time(), "question":question });
+ st.session_state.evals.append({"result": result, "time": time.time(), "question": question});
st.session_state.question_index+=1;
with open(os.path.join("results", f"{st.session_state.worker_id}_{st.session_state.question_set_id}.json"), "w") as f {
json.dump({"worker_id":st.session_state.worker_id, "question_set_id":st.session_state.question_set_id, "evals":st.session_state.evals, "start_time":st.session_state.start_time, "end_time":time.time(), "question_index":st.session_state.question_index }, f, indent=2);
json.dump({"worker_id": st.session_state.worker_id, "question_set_id": st.session_state.question_set_id, "evals": st.session_state.evals, "start_time": st.session_state.start_time, "end_time": time.time(), "question_index": st.session_state.question_index}, f, indent=2);
}
st.components.v1.html("<script>window.parent.document.querySelector('section.main').scrollTo(-1, -1);</script>", height=0);
time.sleep(0.5);
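
For reference, the json.dump call above writes a per-worker progress file to results/<worker_id>_<question_set_id>.json. Under the plain A/B Testing method it has roughly this shape (all values below are illustrative, not taken from a real run; the "result" and "question" fields vary with the evaluation method):

{
  "worker_id": "worker-uuid",
  "question_set_id": "question-set-uuid",
  "evals": [
    {
      "result": {"overall": "Response A", "feedback": ""},
      "time": 1713250000.0,
      "question": ["prompt-id", {"model_a": "response-id-a", "model_b": "response-id-b"}]
    }
  ],
  "start_time": 1713249000.0,
  "end_time": 1713250100.0,
  "question_index": 1
}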
src/components/llm_as_evaluator.impl.jac (10 changes: 5 additions & 5 deletions)
@@ -64,17 +64,17 @@ import:jac from utils, llms, check_engine_status, load_engine, run_inference;
(model_b, model_b_respones_id) = list(question[1].items())[1];
model_a_respones = st.session_state.responses[model_a_respones_id];
model_b_respones = st.session_state.responses[model_b_respones_id];
arguments = {"usecase":prompt_disc, "response_a":model_a_respones, "response_b":model_b_respones };
arguments = {"usecase": prompt_disc, "response_a": model_a_respones, "response_b": model_b_respones};
try {
inference_result = run_inference(model_name, 1, arguments);
question_result = process_inference_result(inference_result['outputs'][0]['response']);
evals.append({"result":question_result, "time":time.time(), "question":question, "inference_result":inference_result });
evals.append({"result": question_result, "time": time.time(), "question": question, "inference_result": inference_result});
} except Exception as e {
evals.append({"result":str(e), "time":time.time(), "question":question });
evals.append({"result": str(e), "time": time.time(), "question": question});
}
}
with open(os.path.join("results", f"llm_as_evaluator_{question_set_id}.json"), "w") as f {
json.dump({"worker_id":"llm_as_evaluator", "question_set_id":question_set_id, "evals":evals, "start_time":time.time(), "end_time":time.time() }, f, indent=2);
json.dump({"worker_id": "llm_as_evaluator", "question_set_id": question_set_id, "evals": evals, "start_time": time.time(), "end_time": time.time()}, f, indent=2);
}
}
} else {
@@ -90,7 +90,7 @@ import:jac from utils, llms, check_engine_status, load_engine, run_inference;
answers = eval(re.search(r"\[Output\](.*)", inference_result, re.DOTALL).group(1).strip());
answers["feedback"] = re.search(r"\[Reasoning\](.*)\[Output\]", inference_result, re.DOTALL).group(1).strip();
} except Exception as e {
answers = {"error":str(e) };
answers = {"error": str(e)};
}
return answers;
}
src/components/model_ranking.jac (4 changes: 2 additions & 2 deletions)
@@ -5,7 +5,7 @@ import:jac from utils, map_prompt_names_to_ids, format_responses_by_prompt;
can model_win_percentage_table {
if st.session_state.get("current_hv_config", None) {
criteria = ["overall"];
- model_performance = {model:{"wins":0, "total":0 } for model in st.session_state.active_list_of_models};
+ model_performance = {model: {"wins": 0, "total": 0} for model in st.session_state.active_list_of_models};
formatted_data = format_responses_by_prompt(st.session_state.workers_data_dir, st.session_state.distribution_file, st.session_state.response_file);
prompt_info = map_prompt_names_to_ids(st.session_state.prompt_data_dir, st.session_state.prompt_info_file);
prompt_ids = list(prompt_info.keys());
@@ -34,7 +34,7 @@ can model_win_percentage_table {
data = [];
for (model, counts) in model_performance.items() {
win_percentage = (counts["wins"] / counts["total"]) if counts["total"] else 0;
data.append({"Model":model, "Win Percentage":win_percentage });
data.append({"Model": model, "Win Percentage": win_percentage});
}
df = pd.DataFrame(data);
df = df.sort_values(by='Win Percentage', ascending=False);
src/components/plot_utils.jac (6 changes: 3 additions & 3 deletions)
@@ -11,7 +11,7 @@ can generate_stacked_bar_chart(model_performance: dict, criteria: list) {

for (model, crits) in model_performance.items() {
for (crit, counts) in crits.items() {
df_data.append({"model":model, "criterion":crit, "wins":counts["wins"], "ties":counts["ties"], "losses":counts["losses"] });
df_data.append({"model": model, "criterion": crit, "wins": counts["wins"], "ties": counts["ties"], "losses": counts["losses"]});
}
}
df = pd.DataFrame(df_data);
@@ -22,7 +22,7 @@ can generate_stacked_bar_chart(model_performance: dict, criteria: list) {
fig_height_per_row = 220;
}
total_fig_height = fig_height_per_row * len(criteria);
colors = {"wins":"green", "ties":"orange", "losses":"red" };
colors = {"wins": "green", "ties": "orange", "losses": "red"};

for (i, criterion) in enumerate(criteria) {
criterion_data = df[df["criterion"] == criterion].sort_values("wins", ascending=False);
@@ -43,7 +43,7 @@ can generate_heatmaps(placeholder: str, model_performance: dict, preference_matr

for (model, crit_dict) in model_performance.items() {
for (crit, win_tie_loss) in crit_dict.items() {
- rows_list.append({"model":model, "criterion":crit, "wins":win_tie_loss['wins'], "ties":win_tie_loss['ties'], "losses":win_tie_loss['losses'] });
+ rows_list.append({"model": model, "criterion": crit, "wins": win_tie_loss['wins'], "ties": win_tie_loss['ties'], "losses": win_tie_loss['losses']});
}
}
df = pd.DataFrame(rows_list);
src/components/setup.impl.jac (14 changes: 7 additions & 7 deletions)
@@ -12,22 +12,22 @@ import:py pandas as pd;
if os.path.exists(os.path.join(".human_eval_config", "config.json")) {
config = json.load(open(os.path.join(".human_eval_config", "config.json")));
} else {
config = {"hv_method":"A/B Testing", "config":{"n_workers":100, "n_questions_per_worker":10, "n_options":2, "ability_to_tie":"Allow", "data_sources":None, "evenly_distributed":False, "show_captcha":True, "completion_code":str(uuid.uuid4()) } };
config = {"hv_method": "A/B Testing", "config": {"n_workers": 100, "n_questions_per_worker": 10, "n_options": 2, "ability_to_tie": "Allow", "data_sources": None, "evenly_distributed": False, "show_captcha": True, "completion_code": str(uuid.uuid4())}};
}
st.session_state.config = config;
}
}

:can:hv_evaluation_method_selector {
eval_methods = {"A/B Testing":{"desc":"A/B Testing is a method where the user is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image":"https://i.ibb.co/kXwypbc/image.png" }, "A/B Testing with Criterions":{"desc":"A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image":"https://i.ibb.co/JF6Q3F3/image.png" } };
eval_methods = {"A/B Testing": {"desc": "A/B Testing is a method where the user is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image": "https://i.ibb.co/kXwypbc/image.png"}, "A/B Testing with Criterions": {"desc": "A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image": "https://i.ibb.co/JF6Q3F3/image.png"}};
criterias = None;
st.subheader("Human Evaluation Method");
(hv_method_col, hv_method_view_col) = st.columns(2);
with hv_method_col {
hv_method = st.selectbox("Select the human evaluation method", list(eval_methods.keys()), index=list(eval_methods.keys()).index(st.session_state.config["hv_method"]));
st.caption(eval_methods[hv_method]["desc"]);
if hv_method == "A/B Testing with Criterions" {
criteria_df = pd.DataFrame([{"Criteria":"clarity", "Description":"Which has better clarity?" }, {"Criteria":"intelligence", "Description":"Which is more intelligent?" }, {"Criteria":"likability", "Description":"Which is more likable?" }, {"Criteria":"trustworthy", "Description":"Which is more trustworthy?" }, {"Criteria":"overall", "Description":"Which is better overall?" }]);
criteria_df = pd.DataFrame([{"Criteria": "clarity", "Description": "Which has better clarity?"}, {"Criteria": "intelligence", "Description": "Which is more intelligent?"}, {"Criteria": "likability", "Description": "Which is more likable?"}, {"Criteria": "trustworthy", "Description": "Which is more trustworthy?"}, {"Criteria": "overall", "Description": "Which is better overall?"}]);
edited_df = st.data_editor(criteria_df, num_rows="dynamic");
criterias = edited_df.to_dict(orient="records");
}
@@ -116,7 +116,7 @@ import:py pandas as pd;
unique_question_pairs = list(itertools.product(model_a_responses, model_b_responses));
unique_question_pairs = random.sample(unique_question_pairs, k);
for question_pair in unique_question_pairs {
- question_pairs.append((prompt_id, {model_pair[0]:question_pair[0], model_pair[1]:question_pair[1] }));
+ question_pairs.append((prompt_id, {model_pair[0]: question_pair[0], model_pair[1]: question_pair[1]}));
}
}
}
@@ -133,12 +133,12 @@ import:py pandas as pd;
selected_question_pairs = random.sample(question_pairs, n_workers * n_questions_per_worker);
random.shuffle(selected_question_pairs);
worker_ids = [str(uuid.uuid4()) for _ in range(n_workers)];
- distribution = {worker_id:selected_question_pairs[i * n_questions_per_worker:(i + 1) * n_questions_per_worker] for (i, worker_id) in enumerate(worker_ids)};
+ distribution = {worker_id: selected_question_pairs[i * n_questions_per_worker:(i + 1) * n_questions_per_worker] for (i, worker_id) in enumerate(worker_ids)};
} else {
question_pairs_copy = question_pairs.copy();
random.shuffle(question_pairs_copy);
worker_ids = [str(uuid.uuid4()) for _ in range(n_workers)];
- distribution = {worker_id:[] for worker_id in worker_ids};
+ distribution = {worker_id: [] for worker_id in worker_ids};
prompt_infos = json.load(open(os.path.join(".human_eval_config", "prompt_info.json")));
prompt_ids = [prompt_info["prompt_id"] for prompt_info in prompt_infos];
questions_per_usecase_per_worker = int(n_questions_per_worker / len(prompt_ids));
@@ -201,7 +201,7 @@ import:py pandas as pd;
:can:get_prompt_info
(data_sources: list) {
st.subheader("Prompt Information");
- prompt_infos = {data_source:{} for data_source in data_sources};
+ prompt_infos = {data_source: {} for data_source in data_sources};
with st.container(border=True) {
for data_source in data_sources {
with st.expander(f"Usecase {data_source}") {
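
The distribution comprehension reformatted above hands each worker a contiguous slice of the shuffled question pairs. A minimal Jac sketch of that slicing, with hypothetical sizes (3 workers, 2 questions per worker) and stand-in pair labels instead of real (prompt_id, responses) tuples:

import:py uuid;

with entry {
    n_workers = 3;
    n_questions_per_worker = 2;
    # stand-in question pairs; in the app these come from the sampled model-response pairs
    selected_question_pairs = [f"pair_{i}" for i in range(n_workers * n_questions_per_worker)];
    worker_ids = [str(uuid.uuid4()) for _ in range(n_workers)];
    distribution = {worker_id: selected_question_pairs[i * n_questions_per_worker:(i + 1) * n_questions_per_worker] for (i, worker_id) in enumerate(worker_ids)};
    print(distribution);  # worker 0 gets pair_0 and pair_1, worker 1 gets pair_2 and pair_3, worker 2 gets pair_4 and pair_5
}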
src/components/setup.jac (2 changes: 1 addition & 1 deletion)
@@ -37,7 +37,7 @@ can setup {
prompt_infos = get_prompt_info(data_sources);
if st.button("Save") {
os.makedirs(".human_eval_config", exist_ok=True);
config = {"hv_method":hv_method, "config":{"n_workers":n_workers, "n_questions_per_worker":n_questions_per_worker, "n_options":n_options, "ability_to_tie":ability_to_tie, "data_sources":data_sources, "evenly_distributed":evenly_distributed, "show_captcha":show_captcha, "completion_code":str(uuid.uuid4()) }, "criterias":criterias, "time_created":datetime.now().strftime("%d/%m/%Y %H:%M:%S") };
config = {"hv_method": hv_method, "config": {"n_workers": n_workers, "n_questions_per_worker": n_questions_per_worker, "n_options": n_options, "ability_to_tie": ability_to_tie, "data_sources": data_sources, "evenly_distributed": evenly_distributed, "show_captcha": show_captcha, "completion_code": str(uuid.uuid4())}, "criterias": criterias, "time_created": datetime.now().strftime("%d/%m/%Y %H:%M:%S")};
st.session_state.config = config;
json.dump(config, open(os.path.join(".human_eval_config", "config.json"), "w"));
models_responses_all = create_neccessary_file(data_sources, prompt_infos);