Commit
chore: Update human evaluation configuration message in human_eval.jac and dashboard.impl.jac
chandralegend committed May 2, 2024
1 parent 4987cf6 commit f2920e0
Showing 6 changed files with 75 additions and 65 deletions.
10 changes: 5 additions & 5 deletions app/src/app.jac
@@ -23,19 +23,19 @@ can main {
if not st.session_state.admin_privileges {
login();
} else {
(dashboard_tab, generator_tab, evaluation_tab, setup_tab, about_tab) = st.tabs(["Dashboard", "Response Generator", "Auto Evaluator", "Human Eval Setup", "About"]);
(dashboard_tab, generator_tab, setup_tab, evaluation_tab, about_tab) = st.tabs(["Dashboard", "Response Generator", "Evaluation Setup", "Auto Evaluator", "About"]);
with dashboard_tab {
dashboard();
dashboard();
}
with generator_tab {
generator();
}
with evaluation_tab {
auto_eval();
}
with setup_tab {
setup();
}
with evaluation_tab {
auto_eval();
}
with about_tab {
about();
}
10 changes: 7 additions & 3 deletions app/src/components/dashboard/dashboard.impl.jac
@@ -77,10 +77,14 @@ can get_outputs -> tuple {
with open(os.path.join("results", _file), "r") as f {
output = json.load(f);
}
if output["question_index"] == st.session_state.current_hv_config["config"]["n_questions_per_worker"] {
try {
if output["question_index"] == st.session_state.current_hv_config["config"]["n_questions_per_worker"] {
full_outputs.append(_file);
} else {
partial_outputs.append(_file);
} else {
partial_outputs.append(_file);
}
} except Exception {
pass;
}
}
}
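For context, the try/except added to get_outputs above is roughly equivalent to the following plain-Python sketch. The results/ layout and the "question_index" check come from the diff; the function name and the explicit n_questions_per_worker parameter (which the real code reads from st.session_state) are illustrative only.

```python
import json
import os

def split_outputs(results_dir: str, n_questions_per_worker: int) -> tuple[list, list]:
    """Partition worker result files into complete and partial submissions,
    skipping files that lack the expected "question_index" key instead of
    crashing the dashboard (the behaviour this commit adds)."""
    full_outputs, partial_outputs = [], []
    for file_name in os.listdir(results_dir):
        with open(os.path.join(results_dir, file_name)) as f:
            output = json.load(f)
        try:
            if output["question_index"] == n_questions_per_worker:
                full_outputs.append(file_name)
            else:
                partial_outputs.append(file_name)
        except Exception:
            # Malformed or partially written result file: skip it.
            pass
    return full_outputs, partial_outputs
```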
2 changes: 1 addition & 1 deletion app/src/components/human_eval/human_eval.jac
@@ -89,6 +89,6 @@ can human_eval {
}
}
} else {
st.error("No human eval config found.");
st.info("Go to Admin Panel and configure the Human Evaluation.");
}
}
63 changes: 27 additions & 36 deletions app/src/components/setup/setup.impl.jac
@@ -1,4 +1,4 @@
'''Human Evaluation Setup Implementations'''
'''Evaluation Setup Implementations'''
import:py streamlit as st;
import:py os;
import:py json;
@@ -13,19 +13,19 @@ import:py pandas as pd;
if os.path.exists(os.path.join(".human_eval_config", "config.json")) {
config = json.load(open(os.path.join(".human_eval_config", "config.json")));
} else {
config = {"hv_method": "A/B Testing", "config": {"n_workers": 100, "n_questions_per_worker": 10, "n_options": 2, "ability_to_tie": "Allow", "data_sources": None, "evenly_distributed": False, "show_captcha": True, "completion_code": str(uuid.uuid4())}};
config = {"hv_method": "A/B Testing", "config": {"n_workers": 100, "n_questions_per_worker": 10, "ability_to_tie": "Allow", "data_sources": None, "evenly_distributed": False, "show_captcha": True, "completion_code": str(uuid.uuid4())}};
}
st.session_state.config = config;
}
}

:can:hv_evaluation_method_selector {
eval_methods = {"A/B Testing": {"desc": "A/B Testing is a method where the user is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image": "https://i.ibb.co/kXwypbc/image.png"}, "A/B Testing with Criterions": {"desc": "A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image": "https://i.ibb.co/JF6Q3F3/image.png"}};
eval_methods = {"A/B Testing": {"desc": "A/B Testing is a method where the evaluator is shown two options and they have to choose the better one. This is repeated for a number of times and the option with the most votes wins.", "image": "https://i.ibb.co/kXwypbc/image.png"}, "A/B Testing with Criterions": {"desc": "A/B Testing with Criterions is a method where the user is shown two options and they have to choose the better one for each criterion (clarity, fluency, etc.).", "image": "https://i.ibb.co/JF6Q3F3/image.png"}};
criterias = None;
st.subheader("Human Evaluation Method");
st.subheader("Evaluation Methodology");
(hv_method_col, hv_method_view_col) = st.columns(2);
with hv_method_col {
hv_method = st.selectbox("Select the human evaluation method", list(eval_methods.keys()), index=list(eval_methods.keys()).index(st.session_state.config["hv_method"]));
hv_method = st.selectbox("Select the Evaluation method", list(eval_methods.keys()), index=list(eval_methods.keys()).index(st.session_state.config["hv_method"]));
st.caption(eval_methods[hv_method]["desc"]);
if hv_method == "A/B Testing with Criterions" {
criteria_df = pd.DataFrame([{"Criteria": "clarity", "Description": "Which has better clarity?"}, {"Criteria": "intelligence", "Description": "Which is more intelligent?"}, {"Criteria": "likability", "Description": "Which is more likable?"}, {"Criteria": "trustworthy", "Description": "Which is more trustworthy?"}, {"Criteria": "overall", "Description": "Which is better overall?"}]);
@@ -50,38 +50,31 @@ import:py pandas as pd;
st.subheader("Human Evaluation Configuration");
(hv_config_1_col, hv_config_2_col, hv_config_3_col) = st.columns(3);
with hv_config_1_col {
n_options = st.number_input("Number of options", min_value=2, max_value=5, value=st.session_state.config["config"]["n_options"], step=1);
n_workers = st.number_input("Number of workers", min_value=10, step=1, value=st.session_state.config["config"]["n_workers"]);
n_questions_per_worker = st.number_input("Number of questions per worker", min_value=2, max_value=100, step=1, value=st.session_state.config["config"]["n_questions_per_worker"]);
show_captcha = st.checkbox("Show Captcha", value=st.session_state.config["config"]["show_captcha"]);
n_workers = st.number_input("Number of workers", min_value=10, step=1, value=st.session_state.config["config"]["n_workers"], help="Number of Evaluators going to participate");
n_questions_per_worker = st.number_input("Number of questions per worker", min_value=2, max_value=100, step=1, value=st.session_state.config["config"]["n_questions_per_worker"], help="Number of questions shown to an Evaluator");
show_captcha = st.checkbox("Show Captcha (Human Verification)", value=st.session_state.config["config"]["show_captcha"]);
}
with hv_config_2_col {
json_files = [f for f in os.listdir("data") if f.endswith(".json")] if os.path.exists("data") else [];
data_sources = st.multiselect("Data sources (Usecases)", json_files, default=st.session_state.config["config"]["data_sources"]);
ability_to_tie = st.selectbox("Ability to tie", ["Allow", "Not Allowed"], index=["Allow", "Not Allowed"].index(st.session_state.config["config"]["ability_to_tie"])) if n_options == 2 else "Not Allowed";
evenly_distributed = st.checkbox("Usecases are Evenly distributed among the workers", value=st.session_state.config["config"]["evenly_distributed"]);
data_sources = st.multiselect("Data sources (Usecases)", json_files, default=st.session_state.config["config"]["data_sources"], help="Select the data sources for the evaluation. Each file should represent a usecase (sigle prompt) you want to evaluate.");
ability_to_tie = st.selectbox("Ability to Choose Both", ["Allow", "Not Allowed"], index=["Allow", "Not Allowed"].index(st.session_state.config["config"]["ability_to_tie"]), help="Select whether the evaluator can choose both options as the same.");
evenly_distributed = st.checkbox("Usecases are Evenly distributed among the workers", value=st.session_state.config["config"]["evenly_distributed"], help="If checked, the usecases will be evenly distributed among the workers. for example, if there are 2 usecases and 10 workers, each worker will get 1 question from each usecase. If not checked, the questions will be randomly distributed.");
}
with hv_config_3_col {
if n_options == 2 {
n_models = st.number_input("Number of models", min_value=2, max_value=100, step=1);
n_usecases = st.number_input("Number of usecases", min_value=1, step=1);
try {
(min_n_responses_needed_per_model, n_model_pairs) = sanity_check(n_models, n_usecases, n_workers, n_questions_per_worker);
st.write("Minimum number of responses needed per model per usecase for even distribution: ", min_n_responses_needed_per_model);
st.write("Number of model pairs: ", n_model_pairs);
} except ZeroDivisionError {
st.caption("Try Changing the number of models or the number of questions per worker.");
st.caption("Following is to check if the configuration is valid.");
n_models = st.number_input("Number of models", min_value=2, max_value=100, step=1, help="Number of models you want to evaluate.");
n_responses_per_model = st.number_input("Number of responses per model", min_value=1, max_value=100, step=1, help="Number of responses per model in the data sources.");
try {
(min_n_responses_needed_per_model, n_model_pairs) = sanity_check(n_models, len(data_sources), n_workers, n_questions_per_worker);
st.write("Minimum number of responses needed per model per usecase for even distribution: ", min_n_responses_needed_per_model);
if n_responses_per_model < min_n_responses_needed_per_model {
st.warning("Number of responses per model is less than the minimum required for even distribution. Try increasing the number of responses per model.");
}
} else {
st.caption("The calculator is not available for more than 2 options.");
}
}
if evenly_distributed {
if not n_questions_per_worker % n_usecases == 0 {
st.warning("The number of questions per worker should be divisible by the number of usecases for even distribution.");
} except Exception {
st.warning("Configuration is not valid. Please check the values.");
}
}
return (n_options, n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha);
return (n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha);
}
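The sanity_check helper referenced above is not part of this diff, so the plain-Python sketch below is only a plausible reconstruction of what the configurator verifies: every unordered model pair should be compared equally often in each usecase, given the number of workers and questions per worker. The formula for the minimum responses per model is an assumption, not the project's actual implementation.

```python
import math

def sanity_check(n_models: int, n_usecases: int, n_workers: int,
                 n_questions_per_worker: int) -> tuple[int, int]:
    """Hypothetical reconstruction of the configurator's validity check."""
    n_model_pairs = math.comb(n_models, 2)
    total_questions = n_workers * n_questions_per_worker
    # Division by zero here (for example, no data source selected) is the kind
    # of failure the UI's try/except around this call is guarding against.
    questions_per_usecase = total_questions // n_usecases
    comparisons_per_pair = questions_per_usecase // n_model_pairs
    # One plausible formula: each model sits in (n_models - 1) pairs, so it
    # needs at least this many distinct responses per usecase.
    min_responses_per_model = comparisons_per_pair * (n_models - 1)
    return min_responses_per_model, n_model_pairs
```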

:can:add_data_sources {
@@ -99,7 +92,7 @@ import:py pandas as pd;
}
}

:can:get_question_pairs(models_responses_all: dict) {
:can:get_question_pairs(models_responses_all: dict) -> list {
n_usecases = len(models_responses_all);
models = list(list(models_responses_all.values())[0].keys());
random.shuffle(models);
@@ -119,11 +112,10 @@ import:py pandas as pd;
}
}
}
json.dump(question_pairs, open(os.path.join(".human_eval_config", "unique_question_pairs.json"), "w"), indent=2);
return question_pairs;
}
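Most of get_question_pairs is collapsed in this view; the shape implied by the surrounding lines (a usecase to model to responses mapping, shuffled models, one entry per model pair) could be sketched in plain Python as below. The pairing strategy and dictionary keys are illustrative assumptions, not the project's exact output format.

```python
import itertools
import random

def get_question_pairs(models_responses_all: dict) -> list:
    """Build one comparison entry per usecase, model pair, and response pair.
    models_responses_all is assumed to map usecase -> model -> list of responses."""
    models = list(next(iter(models_responses_all.values())).keys())
    random.shuffle(models)
    question_pairs = []
    for usecase, responses_by_model in models_responses_all.items():
        for model_a, model_b in itertools.combinations(models, 2):
            for resp_a, resp_b in zip(responses_by_model[model_a],
                                      responses_by_model[model_b]):
                question_pairs.append({
                    "usecase": usecase,
                    "models": [model_a, model_b],
                    "responses": [resp_a, resp_b],
                })
    return question_pairs
```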

:can:get_distribution(question_pairs: list, n_workers: int, n_questions_per_worker: int) {
:can:get_distribution(question_pairs: list, n_workers: int, n_questions_per_worker: int) -> dict {
need_to_be_evenly_distributed = st.session_state.config["config"]["evenly_distributed"];
if not need_to_be_evenly_distributed {
selected_question_pairs = random.sample(question_pairs, n_workers * n_questions_per_worker);
@@ -154,8 +146,7 @@ import:py pandas as pd;
}
}
}
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.json"), "w"), indent=2);
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.bak.json"), "w"), indent=2);
return distribution;
}
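Only the non-even branch of get_distribution is visible above (a random.sample of n_workers * n_questions_per_worker pairs); a minimal plain-Python sketch of that branch follows. The per-worker chunking and the worker-ID keys are assumptions, and the even-distribution branch, which balances usecases across workers, is collapsed in this view and not reproduced.

```python
import random

def get_distribution(question_pairs: list, n_workers: int,
                     n_questions_per_worker: int) -> dict:
    """Assign each worker a disjoint slice of randomly sampled question pairs."""
    selected = random.sample(question_pairs, n_workers * n_questions_per_worker)
    return {
        worker_id: selected[worker_id * n_questions_per_worker:
                            (worker_id + 1) * n_questions_per_worker]
        for worker_id in range(n_workers)
    }
```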

:can:create_neccessary_file(data_sources: list, _prompt_infos: dict) {
@@ -193,11 +184,11 @@ import:py pandas as pd;
}

:can:get_prompt_info(data_sources: list) {
st.subheader("Prompt Information");
prompt_infos = {data_source: {} for data_source in data_sources};
with st.container(border=True) {
st.subheader("Prompt Information");
for data_source in data_sources {
with st.expander(f"Usecase {data_source}") {
with st.expander(f"Usecase {data_source}", expanded=True) {
with open(os.path.join("data", data_source)) as f {
data = json.load(f);
}
53 changes: 34 additions & 19 deletions app/src/components/setup/setup.jac
@@ -1,7 +1,8 @@
'''Human Evaluation Setup'''
'''Evaluation Setup'''
import:py from datetime, datetime;
import:py json;
import:py os;
import:py shutil;
import:py streamlit as st;
import:py uuid;

@@ -17,10 +18,10 @@ can get_distribution(question_pairs: list, n_workers: int, n_questions_per_worke
'''Initializing the Setup Config.'''
can <>init;

'''Human Evaluation Method Selection View'''
'''Evaluation Method Selection View'''
can hv_evaluation_method_selector;

'''Human Evaluation Configurator'''
'''Evaluation Configurator'''
can hv_configurator;

'''Retrieve Prompt Information from the datasources show in a Editable View'''
@@ -29,34 +30,48 @@ can get_prompt_info(data_sources: list) -> dict;
'''Add Data Sources to the Configurator'''
can add_data_sources;

'''Create Necessary Files for the Human Evaluation'''
'''Create Necessary Files for the Evaluation'''
can create_neccessary_file(data_sources: list, _prompt_infos: dict);

'''Human Evaluation Setup Component'''
'''Evaluation Setup Component'''
can setup {
<>init();
st.header("Human Evaluation Setup");
st.caption("This is the setup page for the human eval. You can change the configuration and then click save to save the configuration.");
st.header("Evaluation Setup");
st.caption("This setup will help you to configure the evaluation configuration for both human and auto evaluation.");
with st.container(border=True) {
(hv_method, criterias) = hv_evaluation_method_selector();
}
with st.container(border=True) {
(n_options, n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha) = hv_configurator();
(n_workers, ability_to_tie, data_sources, n_questions_per_worker, evenly_distributed, show_captcha) = hv_configurator();
add_data_sources();
}
if data_sources {
prompt_infos = get_prompt_info(data_sources);
if st.button("Save") {
os.makedirs(".human_eval_config", exist_ok=True);
config = {"hv_method": hv_method, "config": {"n_workers": n_workers, "n_questions_per_worker": n_questions_per_worker, "n_options": n_options, "ability_to_tie": ability_to_tie, "data_sources": data_sources, "evenly_distributed": evenly_distributed, "show_captcha": show_captcha, "completion_code": str(uuid.uuid4())}, "criterias": criterias, "time_created": datetime.now().strftime("%d/%m/%Y %H:%M:%S")};
st.session_state.config = config;
json.dump(config, open(os.path.join(".human_eval_config", "config.json"), "w"));
json.dump(config, open(os.path.join(".human_eval_config", "config.bak.json"), "w"));
models_responses_all = create_neccessary_file(data_sources, prompt_infos);
question_pairs = get_question_pairs(models_responses_all);
get_distribution(question_pairs, n_workers, n_questions_per_worker);
st.toast("Created necessary files!");
st.toast("Saved!");
if st.button("Create Evaluation Configuration") {
try {
os.makedirs(".human_eval_config", exist_ok=True);

assert not evenly_distributed or n_questions_per_worker % len(data_sources) == 0, "Number of questions per worker should be divisible by the number of Usecases for an evenly distributed evaluation.";

config = {"hv_method": hv_method, "config": {"n_workers": n_workers, "n_questions_per_worker": n_questions_per_worker, "ability_to_tie": ability_to_tie, "data_sources": data_sources, "evenly_distributed": evenly_distributed, "show_captcha": show_captcha, "completion_code": str(uuid.uuid4())}, "criterias": criterias, "time_created": datetime.now().strftime("%d/%m/%Y %H:%M:%S")};
st.session_state.config = config;

models_responses_all = create_neccessary_file(data_sources, prompt_infos);
question_pairs = get_question_pairs(models_responses_all);
distribution = get_distribution(question_pairs, n_workers, n_questions_per_worker);

json.dump(config, open(os.path.join(".human_eval_config", "config.json"), "w"));
json.dump(config, open(os.path.join(".human_eval_config", "config.bak.json"), "w"));
json.dump(question_pairs, open(os.path.join(".human_eval_config", "unique_question_pairs.json"), "w"), indent=2);
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.json"), "w"), indent=2);
json.dump(distribution, open(os.path.join(".human_eval_config", "distribution.bak.json"), "w"), indent=2);

st.toast("Created necessary files!");
st.toast("Human Evaluation is Initialized!");
} except Exception as e {
st.error(str(e));
shutil.rmtree(".human_eval_config");
}
}
} else {
st.warning("Please upload at least one data source. or select one from the list if available.");
2 changes: 1 addition & 1 deletion app/src/components/theme.jac
@@ -29,6 +29,6 @@ can initPage(page_title: str) -> None {
can footer {
dir_root = os.path.dirname(os.path.abspath(__file__));
local_css(os.path.join(dir_root, "../assets/footer.css"));
footer = "<div class='footer'><p>Developed by <a href='https://www.jaseci.org/'' target='_blank'><img src='https://www.jaseci.org/wp-content/uploads/2022/02/jaseki-logo-inverted-rgb.svg' alt='Jaseci Logo' height='40'></a></p></div>";
footer = "<div class='footer' style='background-color: rgb(14, 17, 23);'><p style='color: white; margin: 5px;'>Developed by <a href='https://www.jaseci.org/'' target='_blank'><img src='https://www.jaseci.org/wp-content/uploads/2022/02/jaseki-logo-inverted-rgb.svg' alt='Jaseci Logo' height='40'></a></p></div>";
st.markdown(footer, unsafe_allow_html=True);
}
