Add install fail logging
john-b-yang committed Apr 1, 2024
1 parent 68e89ef commit 58d24d1
Showing 7 changed files with 99 additions and 68 deletions.
29 changes: 29 additions & 0 deletions swebench/metrics/constants.py
@@ -0,0 +1,29 @@
from enum import Enum

# Evaluation Log Constants
APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed"
APPLY_PATCH_PASS = ">>>>> Applied Patch"
INSTALL_FAIL = ">>>>> Init Failed"
INSTALL_PASS = ">>>>> Init Succeeded"
RESET_FAILED = ">>>>> Reset Failed"
TESTS_ERROR = ">>>>> Tests Errored"
TESTS_TIMEOUT = ">>>>> Tests Timed Out"

# Result Categories
FAIL_TO_PASS = "FAIL_TO_PASS"
FAIL_TO_FAIL = "FAIL_TO_FAIL"
PASS_TO_PASS = "PASS_TO_PASS"
PASS_TO_FAIL = "PASS_TO_FAIL"

# Test Status Enum
class TestStatus(Enum):
FAILED = "FAILED"
PASSED = "PASSED"
SKIPPED = "SKIPPED"
ERROR = "ERROR"

# Resolved Status Enum
class ResolvedStatus(Enum):
NO = "RESOLVED_NO"
PARTIAL = "RESOLVED_PARTIAL"
FULL = "RESOLVED_FULL"
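Note: the snippet below is an illustrative sketch, not part of this commit. It shows how the centralized constants above might be consumed downstream, mirroring the install-failure check and the ResolvedStatus comparison this diff adds to report.py; the helper name log_indicates_install_failure is hypothetical.

from swebench.metrics.constants import INSTALL_FAIL, ResolvedStatus, TestStatus

def log_indicates_install_failure(log_path: str) -> bool:
    # The harness writes the ">>>>> Init Failed" marker into an eval log when
    # environment setup fails; report generation scans the log for it.
    with open(log_path) as f:
        return INSTALL_FAIL in f.read()

# Elsewhere in this diff, enum members are compared via their string values,
# e.g. get_resolution_status(report) == ResolvedStatus.FULL.value.
assert ResolvedStatus.FULL.value == "RESOLVED_FULL"
assert TestStatus.PASSED.value == "PASSED"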
13 changes: 8 additions & 5 deletions swebench/metrics/conversion.py
@@ -1,17 +1,20 @@
import json, os

from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus
from swebench.metrics.getters import (
get_file_name_from_lp,
get_repo_from_lp,
log_path_to_sms,
from swebench.metrics.constants import (
FAIL_TO_PASS,
FAIL_TO_FAIL,
PASS_TO_PASS,
PASS_TO_FAIL,
TestStatus,
)
from swebench.metrics.getters import (
get_file_name_from_lp,
get_repo_from_lp,
log_path_to_sms,
test_failed,
test_passed,
)
from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER


def convert_log_to_ground_truth(
30 changes: 11 additions & 19 deletions swebench/metrics/getters.py
@@ -1,22 +1,14 @@
import re

from swebench.metrics.constants import (
APPLY_PATCH_FAIL,
APPLY_PATCH_PASS,
RESET_FAILED,
TESTS_ERROR,
TESTS_TIMEOUT,
)
from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus


# Evaluation Log Constants
APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed"
APPLY_PATCH_PASS = ">>>>> Applied Patch"
INSTALL_FAIL = ">>>>> Init Failed"
INSTALL_PASS = ">>>>> Init Succeeded"
RESET_FAILED = ">>>>> Reset Failed"
TESTS_TIMEOUT = ">>>>> Tests Timed Out"
TESTS_ERROR = ">>>>> Tests Errored"

# Result Categories
FAIL_TO_PASS = "FAIL_TO_PASS"
FAIL_TO_FAIL = "FAIL_TO_FAIL"
PASS_TO_PASS = "PASS_TO_PASS"
PASS_TO_FAIL = "PASS_TO_FAIL"
from typing import Tuple


def get_diffs(sm_1: dict, sm_2: dict) -> dict:
@@ -41,7 +33,7 @@ def get_diffs(sm_1: dict, sm_2: dict) -> dict:
return diff_map


def get_logs_eval(log_fp: str) -> (dict, bool):
def get_logs_eval(log_fp: str) -> Tuple[dict, bool]:
"""
Retrieve evaluation results for a task instance from its corresponding log file
@@ -65,7 +57,7 @@ def get_logs_eval(log_fp: str) -> (dict, bool):
return log_parser(content), True


def get_logs_gold(log_fp: str) -> (str, str):
def get_logs_gold(log_fp: str) -> Tuple[str, str]:
"""
Retrieve pre-patch, post-patch test logs from a validation log file
@@ -92,7 +84,7 @@
get_repo_from_lp = lambda x: get_id_from_lp(x).rsplit("-", 1)[0].replace("__", "/")


def log_path_to_sms(log_fp: str, log_parser) -> (list, bool):
def log_path_to_sms(log_fp: str, log_parser) -> Tuple[list, bool]:
"""
Wrapper for getting log data from log_parser file
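Note: an illustrative aside, not part of this commit. The getters.py changes above also replace return annotations written as bare tuples, such as -> (dict, bool), with typing.Tuple. A bare tuple in an annotation is just a tuple object and is rejected or ignored by type checkers, whereas Tuple[dict, bool] (or the built-in tuple[dict, bool] on Python 3.9+) expresses the intended return type:

from typing import Tuple

def get_logs_eval_old(log_fp: str) -> (dict, bool):  # annotation evaluates to a plain tuple object
    ...

def get_logs_eval_new(log_fp: str) -> Tuple[dict, bool]:  # form adopted in this commit
    ...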
9 changes: 1 addition & 8 deletions swebench/metrics/log_parsers.py
@@ -1,13 +1,6 @@
import re

from enum import Enum


class TestStatus(Enum):
FAILED = "FAILED"
PASSED = "PASSED"
SKIPPED = "SKIPPED"
ERROR = "ERROR"
from swebench.metrics.constants import TestStatus


def parse_log_pytest(log: str) -> dict:
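Note: a generic illustration, not part of this commit. This change moves TestStatus out of log_parsers.py and into the shared constants.py. Keeping a single definition avoids the classic pitfall of duplicated Enum classes, whose members never compare equal across copies even when their names and values match (whether or not that motivated this refactor):

from enum import Enum

class StatusA(Enum):
    PASSED = "PASSED"

class StatusB(Enum):
    PASSED = "PASSED"

assert StatusA.PASSED != StatusB.PASSED               # distinct Enum classes never match
assert StatusA.PASSED.value == StatusB.PASSED.value   # only the raw strings do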
16 changes: 5 additions & 11 deletions swebench/metrics/metrics.py
@@ -1,17 +1,11 @@
from enum import Enum
from statistics import mean
from swebench.metrics.getters import (
FAIL_TO_FAIL, FAIL_TO_PASS,
PASS_TO_FAIL, PASS_TO_PASS,
from swebench.metrics.constants import (
FAIL_TO_PASS,
PASS_TO_PASS,
ResolvedStatus,
)


class ResolvedStatus(Enum):
NO = "RESOLVED_NO"
PARTIAL = "RESOLVED_PARTIAL"
FULL = "RESOLVED_FULL"


def compute_fail_to_pass(report: dict) -> float:
"""
Compute fail-to-pass metric. Accepts single report as argument.
@@ -94,4 +88,4 @@ def get_resolution_status(report: dict) -> str:
elif f2p < 1 and f2p > 0 and p2p == 1:
return ResolvedStatus.PARTIAL.value
else:
return ResolvedStatus.NO.value
return ResolvedStatus.NO.value
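Note: an illustrative sketch, not part of this commit; the full-resolution branch is an assumption, since the diff above only shows the PARTIAL and NO tail of get_resolution_status. The function classifies a report by its fail-to-pass (f2p) and pass-to-pass (p2p) rates:

from swebench.metrics.constants import ResolvedStatus

def resolution_status_sketch(f2p: float, p2p: float) -> str:
    if f2p == 1 and p2p == 1:           # assumed FULL condition (not visible in the diff)
        return ResolvedStatus.FULL.value
    elif 0 < f2p < 1 and p2p == 1:      # matches the PARTIAL branch shown above
        return ResolvedStatus.PARTIAL.value
    else:
        return ResolvedStatus.NO.value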
15 changes: 11 additions & 4 deletions swebench/metrics/monitor.py
@@ -1,16 +1,23 @@
import glob
import os

from swebench.metrics.constants import (
APPLY_PATCH_FAIL,
APPLY_PATCH_PASS,
TESTS_TIMEOUT
)
from swebench.metrics.getters import (
log_path_to_sms, get_diffs, get_repo_from_lp,
APPLY_PATCH_FAIL, APPLY_PATCH_PASS, TESTS_TIMEOUT
log_path_to_sms,
get_diffs,
get_repo_from_lp,
)
from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER
from typing import Tuple


def monitor_validation(
path_to_logs: str, log_prefix: str = None
) -> (list, list, list, list):
) -> Tuple[list, list, list, list]:
"""
Check log files generated from a `check_instances` run to see how many instances were successfully
installed and/or tested.
@@ -79,7 +86,7 @@ def monitor_validation(
return failed_install, corrupt_test_patch, corrupt_patch, timeout, success


def monitor_logs_same_diff(log_dir: str, repo: str = None) -> (list, list):
def monitor_logs_same_diff(log_dir: str, repo: str = None) -> Tuple[list, list]:
"""
Given a log directory and repo, return a list of logs where pre-test
and post-test logs are same/different
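Note: illustrative usage only, not part of this commit. The unpacked names follow the return statement visible in the monitor.py hunk above, and the log directory path is a placeholder:

from swebench.metrics.monitor import monitor_validation

failed_install, corrupt_test_patch, corrupt_patch, timeout, success = monitor_validation(
    "path/to/validation/logs",  # placeholder path to check_instances logs
    log_prefix=None,
)
print(f"{len(failed_install)} instance(s) failed to install")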
55 changes: 34 additions & 21 deletions swebench/metrics/report.py
@@ -1,25 +1,32 @@
import glob, json, os

from collections import Counter
from swebench.metrics.getters import (
get_file_name_from_lp,
get_logs_eval,
get_id_from_lp,
from swebench.harness.constants import (
INSTALL_FAIL,
KEY_INSTANCE_ID,
)
from swebench.metrics.constants import (
FAIL_TO_FAIL,
FAIL_TO_PASS,
PASS_TO_FAIL,
PASS_TO_PASS,
)
from swebench.metrics.getters import (
get_file_name_from_lp,
get_logs_eval,
get_id_from_lp,
test_failed,
test_passed,
)
from swebench.metrics.log_parsers import TestStatus
from swebench.metrics.metrics import (
compute_fail_to_pass_unweighted,
compute_fail_to_pass_weighted,
compute_pass_to_pass_unweighted,
compute_pass_to_pass_weighted,
get_resolution_status,
ResolvedStatus,
)
from typing import Tuple


### MARK - Eval Report Generation
@@ -119,7 +126,7 @@ def get_eval_reports_for_logs(
swe_bench_tasks: str,
callback: callable = None,
verbose: bool = False,
) -> (dict, dict):
) -> Tuple[dict, dict]:
"""
Wrapper for getting eval report for a list of evaluation log paths.
@@ -135,7 +142,7 @@ def get_eval_reports_for_logs(
reports_patch_success = {}
reports_patch_failure = {}
eval_refs = json.load(open(swe_bench_tasks))
eval_refs = {t['instance_id']: t for t in eval_refs}
eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs}

for eval_log in eval_logs:
# Remove task instances that do not satisfy callback
@@ -194,7 +201,7 @@ def get_model_eval_summary(
eval_dir: str,
swe_bench_tasks: str,
repo: str = None,
):
) -> dict:
"""
Generate a summary of model evaluation results.
@@ -213,7 +220,7 @@ def get_model_eval_summary(
# Filter by repo if provided
criteria_eval_sm = None
if repo is not None:
criteria_pred = lambda pred: repo in pred["instance_id"]
criteria_pred = lambda pred: repo in pred[KEY_INSTANCE_ID]
criteria_eval_sm = lambda eval_log: repo in eval_log
preds = [x for x in preds if criteria_pred(x)]

@@ -257,7 +264,7 @@ def get_model_eval_summary(

def get_model_report(
model: str, predictions_path: str, swe_bench_tasks: str, log_dir: str
):
) -> dict:
"""
Generate a report of model evaluation results from predictions, task instances,
and evaluation logs.
@@ -271,8 +278,8 @@ def get_model_report(
report_map (dict): map of repo to report
"""
eval_refs = json.load(open(swe_bench_tasks))
eval_refs = [{key: t[key] for key in ["instance_id", "FAIL_TO_PASS", "PASS_TO_PASS"]} for t in eval_refs]
eval_refs = {t['instance_id']: t for t in eval_refs}
eval_refs = [{key: t[key] for key in [KEY_INSTANCE_ID, FAIL_TO_PASS, PASS_TO_PASS]} for t in eval_refs]
eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs}

# Get predictions
predictions = []
@@ -286,37 +293,43 @@ def get_model_report(

# Iterate through predictions
for p in predictions:
repo = p["instance_id"].split(".")[0].rsplit("-", 1)[0].replace("__", "/")
repo = p[KEY_INSTANCE_ID].split(".")[0].rsplit("-", 1)[0].replace("__", "/")
if repo not in report_map:
report_map[repo] = {
"none": [],
"generated": [],
"with_logs": [],
"install_fail": [],
"applied": [],
"resolved": [],
}

# Check if the model patch exists
if p["model_patch"] == None:
report_map[repo]["none"].append(p['instance_id'])
report_map[repo]["none"].append(p[KEY_INSTANCE_ID])
continue
report_map[repo]["generated"].append(p['instance_id'])
report_map[repo]["generated"].append(p[KEY_INSTANCE_ID])

# Get log file
log_path = os.path.join(log_dir, f"{p['instance_id']}.{model}.eval.log")
log_path = os.path.join(log_dir, f"{p[KEY_INSTANCE_ID]}.{model}.eval.log")
if not os.path.exists(log_path):
continue
report_map[repo]["with_logs"].append(p['instance_id'])
report_map[repo]["with_logs"].append(p[KEY_INSTANCE_ID])

# Check if install succeeded
if INSTALL_FAIL in open(log_path).read():
report_map[repo]["install_fail"].append(p[KEY_INSTANCE_ID])
continue

# Get evaluation logs
eval_sm, found = get_logs_eval(log_path)

if not found:
continue
report_map[repo]["applied"].append(p['instance_id'])
report_map[repo]["applied"].append(p[KEY_INSTANCE_ID])

report = get_eval_report(eval_sm, eval_refs[p["instance_id"]])
if get_resolution_status(report) == "RESOLVED_FULL":
report_map[repo]["resolved"].append(p['instance_id'])
report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
if get_resolution_status(report) == ResolvedStatus.FULL.value:
report_map[repo]["resolved"].append(p[KEY_INSTANCE_ID])

return report_map
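Note: illustrative usage only, not part of this commit; the model name, file paths, and prediction-file format are placeholders. It shows where the new "install_fail" bucket added by this diff appears in the returned report map:

from swebench.metrics.report import get_model_report

report_map = get_model_report(
    model="my-model",                       # placeholder model name
    predictions_path="predictions.json",    # placeholder predictions file
    swe_bench_tasks="swe-bench-tasks.json", # placeholder task instances file
    log_dir="eval_logs/",                   # placeholder evaluation log directory
)
for repo, buckets in report_map.items():
    # "install_fail" holds instances whose eval logs contain the INSTALL_FAIL marker.
    print(repo, len(buckets["generated"]), len(buckets["install_fail"]), len(buckets["resolved"]))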
