Commit
chore: Update human evaluation configuration message in human_eval.jac and dashboard.impl.jac
chandralegend committed May 2, 2024
1 parent 4987cf6 commit f2920e0
Showing 6 changed files with 75 additions and 65 deletions.
10 changes: 5 additions & 5 deletions app/src/app.jac
@@ -23,19 +23,19 @@ can main {
if not st.session_state.admin_privileges {
login();
} else {
(dashboard_tab, generator_tab, evaluation_tab, setup_tab, about_tab) = st.tabs(["Dashboard", "Response Generator", "Auto Evaluator", "Human Eval Setup", "About"]);
(dashboard_tab, generator_tab, setup_tab, evaluation_tab, about_tab) = st.tabs(["Dashboard", "Response Generator", "Evaluation Setup", "Auto Evaluator", "About"]);
with dashboard_tab {
dashboard();
dashboard();
}
with generator_tab {
generator();
}
with evaluation_tab {
auto_eval();
}
with setup_tab {
setup();
}
with evaluation_tab {
auto_eval();
}
with about_tab {
about();
}
10 changes: 7 additions & 3 deletions app/src/components/dashboard/dashboard.impl.jac
@@ -77,10 +77,14 @@ can get_outputs -> tuple {
with open(os.path.join("results", _file), "r") as f {
output = json.load(f);
}
if output["question_index"] == st.session_state.current_hv_config["config"]["n_questions_per_worker"] {
try {
if output["question_index"] == st.session_state.current_hv_config["config"]["n_questions_per_worker"] {
full_outputs.append(_file);
} else {
partial_outputs.append(_file);
} else {
partial_outputs.append(_file);
}
} except Exception {
pass;
}
}
}
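For context, the try/except added to get_outputs above is roughly equivalent to the following plain-Python sketch. The results/ layout and the "question_index" check come from the diff; the function name and the explicit n_questions_per_worker parameter (which the real code reads from st.session_state) are illustrative only.

```python
import json
import os

def split_outputs(results_dir: str, n_questions_per_worker: int) -> tuple[list, list]:
    """Partition worker result files into complete and partial submissions,
    skipping files that lack the expected "question_index" key instead of
    crashing the dashboard (the behaviour this commit adds)."""
    full_outputs, partial_outputs = [], []
    for file_name in os.listdir(results_dir):
        with open(os.path.join(results_dir, file_name)) as f:
            output = json.load(f)
        try:
            if output["question_index"] == n_questions_per_worker:
                full_outputs.append(file_name)
            else:
                partial_outputs.append(file_name)
        except Exception:
            # Malformed or partially written result file: skip it.
            pass
    return full_outputs, partial_outputs
```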
2 changes: 1 addition & 1 deletion app/src/components/human_eval/human_eval.jac
@@ -89,6 +89,6 @@ can human_eval {
}
}
} else {
st.error("No human eval config found.");
st.info("Go to Admin Panel and configure the Human Evaluation.");
}
}
63 changes: 27 additions & 36 deletions app/src/components/setup/setup.impl.jac
@@ -1,4 +1,4 @@
'''Human Evaluation Setup Implementations'''
'''Evaluation Setup Implementations'''
import:py streamlit as st;
import:py os;
import:py json;
@@ -13,19 +13,19 @@ import:py pandas as pd;
if os.path.exists(os.path.join(".human_eval_config", "config.json")) {
config = json.load(open(os.path.join(".human_eval_config", "config.json")));
} else {
config = {"hv_method": "A/B Testing", "config": {"n_workers": 100, "n_questions_per_worker": 10, "n_options": 2, "ability_to_tie": "Allow", "data_sources": None, "evenly_distributed": False, "show_captcha": True, "completion_code": str(uuid.uuid4())}};
config = {"hv_method": "A/B Testing", "config": {"n_workers": 100, "n_questions_per_worker": 10, "ability_to_tie": "Allow", "data_sources": None, "evenly_distributed": False, "show_captcha": True, "completion_code": str(uuid.uuid4())}};
}
st.session_state.config = config;
}
}

:can:hv_evaluation_method_selector {
eval_methods = {"A/B Testing": {"desc": "A/B Testing is a method where the user is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image": "https://i.ibb.co/kXwypbc/image.png"}, "A/B Testing with Criterions": {"desc": "A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image": "https://i.ibb.co/JF6Q3F3/image.png"}};
eval_methods = {"A/B Testing": {"desc": "A/B Testing is a method where the evaluator is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image": "https://i.ibb.co/kXwypbc/image.png"}, "A/B Testing with Criterions": {"desc": "A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image": "https://i.ibb.co/JF6Q3F3/image.png"}};
criterias = None;
st.subheader("Human Evaluation Method");
st.subheader("Evaluation Methodology");
(hv_method_col, hv_method_view_col) = st.columns(2);
with hv_method_col {
hv_method = st.selectbox("Select the human evaluation method", list(eval_methods.keys()), index=list(eval_methods.keys()).index(st.session_state.config["hv_method"]));
hv_method = st.selectbox("Select the Evaluation method", list(eval_methods.keys()), index=list(eval_methods.keys()).index(st.session_state.config["hv_method"]));
st.caption(eval_methods[hv_method]["desc"]);
if hv_method == "A/B Testing with Criterions" {
criteria_df = pd.DataFrame([{"Criteria": "clarity", "Description": "Which has better clarity?"}, {"Criteria": "intelligence", "Description": "Which is more intelligent?"}, {"Criteria": "likability", "Description": "Which is more likable?"}, {"Criteria": "trustworthy", "Description": "Which is more trustworthy?"}, {"Criteria": "overall", "Description": "Which is better overall?"}]);
@@ -50,38 +50,31 @@ import:py pandas as pd;
st.subheader("Human Evaluation Configuration");
(hv_config_1_col, hv_config_2_col, hv_config_3_col) = st.columns(3);
with hv_config_1_col {
n_options = st.number_input("Number of options", min_value=2, max_value=5, value=st.session_state.config["config"]["n_options"], step=1);
n_workers = st.number_input("Number of workers", min_value=10, step=1, value=st.session_state.config["config"]["n_workers"]);
n_questions_per_worker = st.number_input("Number of questions per worker", min_value=2, max_value=100, step=1, value=st.session_state.config["config"]["n_questions_per_worker"]);
show_captcha = st.checkbox("Show Captcha", value=st.session_state.config["config"]["show_captcha"]);
n_workers = st.number_input("Number of workers", min_value=10, step=1, value=st.session_state.config["config"]["n_workers"], help="Number of Evaluators going to participate");
n_questions_per_worker = st.number_input("Number of questions per worker", min_value=2, max_value=100, step=1, value=st.session_state.config["config"]["n_questions_per_worker"], help="Number of questions shown to an Evaluator");
show_captcha = st.checkbox("Show Captcha (Human Verification)", value=st.session_state.config["config"]["show_captcha"]);
}
with hv_config_2_col {
json_files = [f for f in os.listdir("data") if f.endswith(".json")] if os.path.exists("data") else [];
data_sources = st.multiselect("Data sources (Usecases)", json_files, default=st.session_state.config["config"]["data_sources"]);
ability_to_tie = st.selectbox("Ability to tie", ["Allow", "Not Allowed"], index=["Allow", "Not Allowed"].index(st.session_state.config["config"]["ability_to_tie"])) if n_options == 2 else "Not Allowed";
evenly_distributed = st.checkbox("Usecases are Evenly distributed among the workers", value=st.session_state.config["config"]["evenly_distributed"]);
data_sources = st.multiselect("Data sources (Usecases)", json_files, default=st.session_state.config["config"]["data_sources"], help="Select the data sources for the evaluation. Each file should represent a usecase (sigle prompt) you want to evaluate.");
ability_to_tie = st.selectbox("Ability to Choose Both", ["Allow", "Not Allowed"], index=["Allow", "Not Allowed"].index(st.session_state.config["config"]["ability_to_tie"]), help="Select whether the evaluator can choose both options as the same.");
evenly_distributed = st.checkbox("Usecases are Evenly distributed among the workers", value=st.session_state.config["config"]["evenly_distributed"], help="If checked, the usecases will be evenly distributed among the workers. for example, if there are 2 usecases and 10 workers, each worker will get 1 question from each usecase. If not checked, the questions will be randomly distributed.");
}
with hv_config_3_col {
if n_options == 2 {
n_models = st.number_input("Number of models", min_value=2, max_value=100, step=1);
n_usecases = st.number_input("Number of usecases", min_value=1, step=1);
try {
(min_n_responses_needed_per_model, n_model_pairs) = sanity_check(n_models, n_usecases, n_workers, n_questions_per_worker);
st.write("Minimum number of responses needed per model per usecase for even distribution: ", min_n_responses_needed_per_model);
st.write("Number of model pairs: ", n_model_pairs);
} except ZeroDivisionError {
st.caption("Try Changing the number of models or the number of questions per worker.");
st.caption("Following is to check if the configuration is valid.");
n_models = st.number_input("Number of models", min_value=2, max_value=100, step=1, help="Number of models you want to evaluate.");
n_responses_per_model = st.number_input("Number of responses per model", min_value=1, max_value=100, step=1, help="Number of responses per model in the data sources.");
try {
(min_n_responses_needed_per_model, n_model_pairs) = sanity_check(n_models, len(data_sources), n_workers, n_questions_per_worker);
st.write("Minimum number of responses needed per model per usecase for even distribution: ", min_n_responses_needed_per_model);
if n_responses_per_model < min_n_responses_needed_per_model {
st.warning("Number of responses per model is less than the minimum required for even distribution. Try increasing the number of responses per model.");
}
} else {
st.caption("The calculator is not available for more than 2 options.");
}
}
if evenly_distributed {
if not n_questions_per_worker % n_usecases == 0 {
st.warning("The number of questions per worker should be divisible by the number of usecases for even distribution.");
} except Exception {
st.warning("Configuration is not valid. Please check the values.");
}
}
return (n_options, n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha);
return (n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha);
}
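The sanity_check helper referenced above is not part of this diff, so the plain-Python sketch below is only a plausible reconstruction of what the configurator verifies: every unordered model pair should be compared equally often in each usecase, given the number of workers and questions per worker. The formula for the minimum responses per model is an assumption, not the project's actual implementation.

```python
import math

def sanity_check(n_models: int, n_usecases: int, n_workers: int,
                 n_questions_per_worker: int) -> tuple[int, int]:
    """Hypothetical reconstruction of the configurator's validity check."""
    n_model_pairs = math.comb(n_models, 2)
    total_questions = n_workers * n_questions_per_worker
    # Division by zero here (for example, no data source selected) is the kind
    # of failure the UI's try/except around this call is guarding against.
    questions_per_usecase = total_questions // n_usecases
    comparisons_per_pair = questions_per_usecase // n_model_pairs
    # One plausible formula: each model sits in (n_models - 1) pairs, so it
    # needs at least this many distinct responses per usecase.
    min_responses_per_model = comparisons_per_pair * (n_models - 1)
    return min_responses_per_model, n_model_pairs
```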

:can:add_data_sources {
@@ -99,7 +92,7 @@ import:py pandas as pd;
}
}

:can:get_question_pairs(models_responses_all: dict) {
:can:get_question_pairs(models_responses_all: dict) -> list {
n_usecases = len(models_responses_all);
models = list(list(models_responses_all.values())[0].keys());
random.shuffle(models);
@@ -119,11 +112,10 @@ import:py pandas as pd;
}
}
}
json.dump(question_pairs, open(os.path.join(".human_eval_config", "unique_question_pairs.json"), "w"), indent=2);
return question_pairs;
}
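Most of get_question_pairs is collapsed in this view; the shape implied by the surrounding lines (a usecase to model to responses mapping, shuffled models, one entry per model pair) could be sketched in plain Python as below. The pairing strategy and dictionary keys are illustrative assumptions, not the project's exact output format.

```python
import itertools
import random

def get_question_pairs(models_responses_all: dict) -> list:
    """Build one comparison entry per usecase, model pair, and response pair.
    models_responses_all is assumed to map usecase -> model -> list of responses."""
    models = list(next(iter(models_responses_all.values())).keys())
    random.shuffle(models)
    question_pairs = []
    for usecase, responses_by_model in models_responses_all.items():
        for model_a, model_b in itertools.combinations(models, 2):
            for resp_a, resp_b in zip(responses_by_model[model_a],
                                      responses_by_model[model_b]):
                question_pairs.append({
                    "usecase": usecase,
                    "models": [model_a, model_b],
                    "responses": [resp_a, resp_b],
                })
    return question_pairs
```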

:can:get_distribution(question_pairs: list, n_workers: int, n_questions_per_worker: int) {
:can:get_distribution(question_pairs: list, n_workers: int, n_questions_per_worker: int) -> dict {
need_to_be_evenly_distributed = st.session_state.config["config"]["evenly_distributed"];
if not need_to_be_evenly_distributed {
selected_question_pairs = random.sample(question_pairs, n_workers * n_questions_per_worker);
@@ -154,8 +146,7 @@ import:py pandas as pd;
}
}
}
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.json"), "w"), indent=2);
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.bak.json"), "w"), indent=2);
return distribution;
}
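Only the non-even branch of get_distribution is visible above (a random.sample of n_workers * n_questions_per_worker pairs); a minimal plain-Python sketch of that branch follows. The per-worker chunking and the worker-ID keys are assumptions, and the even-distribution branch, which balances usecases across workers, is collapsed in this view and not reproduced.

```python
import random

def get_distribution(question_pairs: list, n_workers: int,
                     n_questions_per_worker: int) -> dict:
    """Assign each worker a disjoint slice of randomly sampled question pairs."""
    selected = random.sample(question_pairs, n_workers * n_questions_per_worker)
    return {
        worker_id: selected[worker_id * n_questions_per_worker:
                            (worker_id + 1) * n_questions_per_worker]
        for worker_id in range(n_workers)
    }
```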

:can:create_neccessary_file(data_sources: list, _prompt_infos: dict) {
@@ -193,11 +184,11 @@ import:py pandas as pd;
}

:can:get_prompt_info(data_sources: list) {
st.subheader("Prompt Information");
prompt_infos = {data_source: {} for data_source in data_sources};
with st.container(border=True) {
st.subheader("Prompt Information");
for data_source in data_sources {
with st.expander(f"Usecase {data_source}") {
with st.expander(f"Usecase {data_source}", expanded=True) {
with open(os.path.join("data", data_source)) as f {
data = json.load(f);
}
53 changes: 34 additions & 19 deletions app/src/components/setup/setup.jac
@@ -1,7 +1,8 @@
'''Human Evaluation Setup'''
'''Evaluation Setup'''
import:py from datetime, datetime;
import:py json;
import:py os;
import:py shutil;
import:py streamlit as st;
import:py uuid;

@@ -17,10 +18,10 @@ can get_distribution(question_pairs: list, n_workers: int, n_questions_per_worke
'''Initializing the Setup Config.'''
can <>init;

'''Human Evaluation Method Selection View'''
'''Evaluation Method Selection View'''
can hv_evaluation_method_selector;

'''Human Evaluation Configurator'''
'''Evaluation Configurator'''
can hv_configurator;

'''Retrieve Prompt Information from the datasources show in a Editable View'''
@@ -29,34 +30,48 @@ can get_prompt_info(data_sources: list) -> dict;
'''Add Data Sources to the Configurator'''
can add_data_sources;

'''Create Necessary Files for the Human Evaluation'''
'''Create Necessary Files for the Evaluation'''
can create_neccessary_file(data_sources: list, _prompt_infos: dict);

'''Human Evaluation Setup Component'''
'''Evaluation Setup Component'''
can setup {
<>init();
st.header("Human Evaluation Setup");
st.caption("This is the setup page for the human eval. You can change the configuration and then click save to save the configuration.");
st.header("Evaluation Setup");
st.caption("This setup will help you to configure the evaluation configuration for both human and auto evaluation.");
with st.container(border=True) {
(hv_method, criterias) = hv_evaluation_method_selector();
}
with st.container(border=True) {
(n_options, n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha) = hv_configurator();
(n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha) = hv_configurator();
add_data_sources();
}
if data_sources {
prompt_infos = get_prompt_info(data_sources);
if st.button("Save") {
os.makedirs(".human_eval_config", exist_ok=True);
config = {"hv_method": hv_method, "config": {"n_workers": n_workers, "n_questions_per_worker": n_questions_per_worker, "n_options": n_options, "ability_to_tie": ability_to_tie, "data_sources": data_sources, "evenly_distributed": evenly_distributed, "show_captcha": show_captcha, "completion_code": str(uuid.uuid4())}, "criterias": criterias, "time_created": datetime.now().strftime("%d/%m/%Y %H:%M:%S")};
st.session_state.config = config;
json.dump(config, open(os.path.join(".human_eval_config", "config.json"), "w"));
json.dump(config, open(os.path.join(".human_eval_config", "config.bak.json"), "w"));
models_responses_all = create_neccessary_file(data_sources, prompt_infos);
question_pairs = get_question_pairs(models_responses_all);
get_distribution(question_pairs, n_workers, n_questions_per_worker);
st.toast("Created necessary files!");
st.toast("Saved!");
if st.button("Create Evaluation Configuration") {
try {
os.makedirs(".human_eval_config", exist_ok=True);

assert not evenly_distributed or n_questions_per_worker % len(data_sources) == 0, "Number of questions per worker should be divisible by the number of Usecases for an evenly distributed evaluation.";

config = {"hv_method": hv_method, "config": {"n_workers": n_workers, "n_questions_per_worker": n_questions_per_worker, "ability_to_tie": ability_to_tie, "data_sources": data_sources, "evenly_distributed": evenly_distributed, "show_captcha": show_captcha, "completion_code": str(uuid.uuid4())}, "criterias": criterias, "time_created": datetime.now().strftime("%d/%m/%Y %H:%M:%S")};
st.session_state.config = config;

models_responses_all = create_neccessary_file(data_sources, prompt_infos);
question_pairs = get_question_pairs(models_responses_all);
distribution = get_distribution(question_pairs, n_workers, n_questions_per_worker);

json.dump(config, open(os.path.join(".human_eval_config", "config.json"), "w"));
json.dump(config, open(os.path.join(".human_eval_config", "config.bak.json"), "w"));
json.dump(question_pairs, open(os.path.join(".human_eval_config", "unique_question_pairs.json"), "w"), indent=2);
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.json"), "w"), indent=2);
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.bak.json"), "w"), indent=2);

st.toast("Created necessary files!");
st.toast("Human Evaluation is Initialized!");
} except Exception as e {
st.error(str(e));
shutil.rmtree(".human_eval_config");
}
}
} else {
st.warning("Please upload at least one data source. or select one from the list if available.");
2 changes: 1 addition & 1 deletion app/src/components/theme.jac
@@ -29,6 +29,6 @@ can initPage(page_title: str) -> None {
can footer {
dir_root = os.path.dirname(os.path.abspath(__file__));
local_css(os.path.join(dir_root, "../assets/footer.css"));
footer = "<div class='footer'><p>Developed by <a href='https://www.jaseci.org/'' target='_blank'><img src='https://www.jaseci.org/wp-content/uploads/2022/02/jaseki-logo-inverted-rgb.svg' alt='Jaseci Logo' height='40'></a></p></div>";
footer = "<div class='footer' style='background-color: rgb(14, 17, 23);'><p style='color: white; margin: 5px;'>Developed by <a href='https://www.jaseci.org/'' target='_blank'><img src='https://www.jaseci.org/wp-content/uploads/2022/02/jaseki-logo-inverted-rgb.svg' alt='Jaseci Logo' height='40'></a></p></div>";
st.markdown(footer, unsafe_allow_html=True);
}
