def taco_coerce_types(row, schema: "Schema"):
    """Coerce one row's values to the column types declared in ``schema``.

    For every column in ``schema`` the Arrow type of the row's value is
    inferred with ``pa.array([value])`` and compared with the declared
    column type.  When the two differ:

    * a column declared as ``pa.string()`` receives ``str(value)``, or
      ``""`` if the value cannot be stringified;
    * a column declared as ``pa.null()`` receives ``None``;
    * a column of any other declared type keeps its value unchanged.

    A value whose Arrow type cannot be inferred at all (for example a list
    mixing incompatible element types) is treated as a mismatch rather than
    aborting the whole map.

    Args:
        row: Mapping of column name to value.  It must contain every name
            in ``schema.names``.  The mapping itself is not modified.
        schema: Object exposing parallel ``names`` and ``types`` sequences,
            such as the result of ``Dataset.schema()``.

    Returns:
        A new ``dict`` with the same keys as ``row`` and the coerced values.

    Raises:
        KeyError: If ``row`` lacks a column named in ``schema``.
    """
    # Work on a shallow copy so the caller's row is never mutated.
    coerced = dict(row)

    # Loop-invariant Arrow type singletons.
    string_type = pa.string()
    null_type = pa.null()

    for key, schema_type in zip(schema.names, schema.types):
        value = coerced[key]
        try:
            mismatch = pa.array([value]).type != schema_type
        except (pa.ArrowException, TypeError, ValueError, OverflowError):
            # Arrow could not infer a type for this value; it certainly does
            # not match the declared column type, so fall through to coercion.
            mismatch = True

        if not mismatch:
            continue

        if schema_type == string_type:
            try:
                coerced[key] = str(value)
            except Exception:
                # Deliberate best-effort: a value with a broken __str__
                # degrades to an empty string instead of failing the row.
                coerced[key] = ""
        elif schema_type == null_type:
            coerced[key] = None

    return coerced
APPSScorer(response_column="formatted_response"), - TACOScorer(response_column="formatted_response"), + TACOScorer(response_column="formatted_response", backend="ray"), numina_scorer, numina_scorer, numina_scorer, @@ -168,4 +176,6 @@ # 6. Save datasets dir_name = f"sky-t1-preview-{i}_parquet" + datasets[i] = datasets[i].materialize() + # breakpoint() datasets[i].write_parquet(os.path.abspath(dir_name)) diff --git a/skythought/evals/scoring/apps/apps.py b/skythought/evals/scoring/apps/apps.py index f70c03d..ab6b855 100644 --- a/skythought/evals/scoring/apps/apps.py +++ b/skythought/evals/scoring/apps/apps.py @@ -1,6 +1,6 @@ import copy import json -from typing import Any, Dict +from typing import Any, Dict, Literal import numpy as np import ray @@ -19,11 +19,13 @@ def __init__( response_column="response", answer_column="solutions", input_column="input_output", + backend: Literal["mp", "ray"] = "ray", ) -> None: super().__init__() self.response_column = response_column self.answer_column = answer_column self.input_column = input_column + self.backend = backend def score(self, row: Dict[str, Any]): TIMEOUT = 10