LocalModuleTest.test_load_metric_code_eval fails with "The "code_eval" metric executes untrusted model-generated code in Python." #597

Open
jpodivin opened this issue Jun 8, 2024 · 0 comments

Environment:

Python 3.8
evaluate: a4bdc10
OS: Ubuntu 24
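
A plausible local reproduction (the exact command used for this trace is not stated; the -n flag assumes pytest-xdist, which the [gw0] worker id suggests is in use), run from the repository root:

    import pytest

    # Select only the code_eval case of the local metric tests and run it
    # across xdist workers, mirroring the [gw0] worker seen in the trace.
    pytest.main(["tests/test_metric_common.py", "-k", "code_eval", "-n", "auto"])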

Trace:

=========================================================== FAILURES ============================================================
__________________________________________ LocalModuleTest.test_load_metric_code_eval ___________________________________________
[gw0] linux -- Python 3.8.19 /var/git/repos/evaluate/.venv/bin/python

self = <tests.test_metric_common.LocalModuleTest testMethod=test_load_metric_code_eval>, evaluation_module_name = 'code_eval'
evaluation_module_type = 'metric'

    def test_load(self, evaluation_module_name, evaluation_module_type):
        doctest.ELLIPSIS_MARKER = "[...]"
        evaluation_module = importlib.import_module(
            evaluate.loading.evaluation_module_factory(
                os.path.join(evaluation_module_type + "s", evaluation_module_name), module_type=evaluation_module_type
            ).module_path
        )
        evaluation_instance = evaluate.loading.import_main_class(evaluation_module.__name__)
        # check parameters
        parameters = inspect.signature(evaluation_instance._compute).parameters
        self.assertTrue(all([p.kind != p.VAR_KEYWORD for p in parameters.values()]))  # no **kwargs
        # run doctest
        with self.patch_intensive_calls(evaluation_module_name, evaluation_module.__name__):
            with self.use_local_metrics(evaluation_module_type):
                try:
>                   results = doctest.testmod(evaluation_module, verbose=True, raise_on_error=True)

tests/test_metric_common.py:117: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.8/doctest.py:1956: in testmod
    runner.run(test)
/usr/lib/python3.8/doctest.py:1844: in run
    r = DocTestRunner.run(self, test, compileflags, out, False)
/usr/lib/python3.8/doctest.py:1483: in run
    return self.__run(test, compileflags, out)
/usr/lib/python3.8/doctest.py:1388: in __run
    self.report_unexpected_exception(out, test, example,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <doctest.DebugRunner object at 0x785042c7ebe0>
out = <built-in method write of _io.TextIOWrapper object at 0x7851f6fd0520>
test = <DocTest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval...dules/metrics/code_eval/78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176/code_eval.py:144 (5 examples)>
example = <doctest.Example object at 0x785042c7efd0>
exc_info = (<class 'ValueError'>, ValueError('\n################################################################################\...##############################################################################'), <traceback object at 0x785043fb5680>)

    def report_unexpected_exception(self, out, test, example, exc_info):
>       raise UnexpectedException(test, example, exc_info)
E       doctest.UnexpectedException: <DocTest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval.CodeEval from /tmp/pytest-of-jpodivin/pytest-1/popen-gw0/cache/modules/evaluate_modules/metrics/code_eval/78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176/code_eval.py:144 (5 examples)>

/usr/lib/python3.8/doctest.py:1850: UnexpectedException

During handling of the above exception, another exception occurred:

self = <tests.test_metric_common.LocalModuleTest testMethod=test_load_metric_code_eval>, evaluation_module_name = 'code_eval'
evaluation_module_type = 'metric'

    def test_load(self, evaluation_module_name, evaluation_module_type):
        doctest.ELLIPSIS_MARKER = "[...]"
        evaluation_module = importlib.import_module(
            evaluate.loading.evaluation_module_factory(
                os.path.join(evaluation_module_type + "s", evaluation_module_name), module_type=evaluation_module_type
            ).module_path
        )
        evaluation_instance = evaluate.loading.import_main_class(evaluation_module.__name__)
        # check parameters
        parameters = inspect.signature(evaluation_instance._compute).parameters
        self.assertTrue(all([p.kind != p.VAR_KEYWORD for p in parameters.values()]))  # no **kwargs
        # run doctest
        with self.patch_intensive_calls(evaluation_module_name, evaluation_module.__name__):
            with self.use_local_metrics(evaluation_module_type):
                try:
                    results = doctest.testmod(evaluation_module, verbose=True, raise_on_error=True)
                except doctest.UnexpectedException as e:
>                   raise e.exc_info[1]  # raise the exception that doctest caught

tests/test_metric_common.py:119: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.8/doctest.py:1336: in __run
    exec(compile(example.source, filename, "single",
<doctest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval.CodeEval[3]>:1: in <module>
    ???
.venv/lib/python3.8/site-packages/evaluate/module.py:467: in compute
    output = self._compute(**inputs, **compute_kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = EvaluationModule(name: "code_eval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='str... predictions=candidates, k=[1, 2])
    >>> print(pass_at_k)
    {'pass@1': 0.5, 'pass@2': 1.0}
""", stored examples: 0)
predictions = [['def add(a,b): return a*b', 'def add(a, b): return a+b']], references = ['assert add(2,3)==5'], k = [1, 2]
num_workers = 4, timeout = 3.0

    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
        """Returns the scores"""
    
        if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
>           raise ValueError(_WARNING)
E           ValueError: 
E           ################################################################################
E                                             !!!WARNING!!!
E           ################################################################################
E           The "code_eval" metric executes untrusted model-generated code in Python.
E           Although it is highly unlikely that model-generated code will do something
E           overtly malicious in response to this test suite, model-generated code may act
E           destructively due to a lack of model capability or alignment.
E           Users are strongly encouraged to sandbox this evaluation suite so that it
E           does not perform destructive actions on their host or network. For more
E           information on how OpenAI sandboxes its code, see the paper "Evaluating Large
E           Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
E           
E           Once you have read this disclaimer and taken appropriate precautions,
E           set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
E           with:
E           
E           >>> import os
E           >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
E           
E           ################################################################################
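
For reference, the doctest that trips this guard is the standard code_eval usage example from the metric's docstring (visible in the trace above); it can only run once the opt-in variable is set before compute() is called. A minimal sketch of that opt-in, assuming evaluate.load("code_eval") and a sandboxed environment as the warning recommends:

    import os

    # Opt in to executing model-generated code; only do this in a sandbox.
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"

    import evaluate

    code_eval = evaluate.load("code_eval")
    candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]
    pass_at_k, results = code_eval.compute(
        references=["assert add(2,3)==5"],
        predictions=candidates,
        k=[1, 2],
    )
    print(pass_at_k)  # expected: {'pass@1': 0.5, 'pass@2': 1.0}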