diff --git a/.github/workflows/run-ut-on-pr-py.yml b/.github/workflows/run-ut-on-pr-py.yml index 30a27c3b..58f39b70 100644 --- a/.github/workflows/run-ut-on-pr-py.yml +++ b/.github/workflows/run-ut-on-pr-py.yml @@ -17,7 +17,7 @@ env: PY310_VERSION: 3.10.12 jobs: pr_run_test: - runs-on: [self-hosted, Linux] + runs-on: [self-hosted, Linux, run] timeout-minutes: 20 steps: - name: Checkout code diff --git a/ais_bench/benchmark/datasets/mbpp.py b/ais_bench/benchmark/datasets/mbpp.py index 17402826..7d802505 100644 --- a/ais_bench/benchmark/datasets/mbpp.py +++ b/ais_bench/benchmark/datasets/mbpp.py @@ -225,7 +225,7 @@ def __init__(self, metric: str = 'MBPP') -> None: DSET_CODES.INVALID_MBPP_METRIC, f"MBPP evaluator metric must be 'MBPP' or 'MBPPPlus', got '{self.metric}'" ) - super.__init__() + super().__init__() def score(self, predictions, references): if len(predictions) != len(references): @@ -397,13 +397,13 @@ def _execution(programs, timeout): exec(programs, exec_globals) key.append('pass') except TimeOutException: - logger.debug(f"Program execution timeout for index {index}") + logger.debug(f"Program execution timeout for task_id {task_id}") key.append('timeout') except AssertionError as e: - logger.debug(f"Program assertion failed for index {index}: {e}") + logger.debug(f"Program assertion failed for task_id {task_id}: {e}") key.append('wrong_answer') except BaseException as e: - logger.debug(f"Program execution failed for index {index}: {e}") + logger.debug(f"Program execution failed for task_id {task_id}: {e}") key.append('failed') manager = multiprocessing.Manager() @@ -428,10 +428,11 @@ class MBPPPassKEvaluator(MBPPEvaluator): k(Tuple[int]): Choices of Pass@k. Defaults to (1, 10, 100) """ - def __init__(self, k=(1, 10, 100)) -> None: + def __init__(self, k=(1, 10, 100), metric: str = 'MBPP') -> None: if not isinstance(k, Sequence): k = (k, ) self.k = k + super().__init__(metric=metric) @staticmethod def estimate_pass_at_k(