From fbee4e2095d64e939c9059be007792e6ea69b72f Mon Sep 17 00:00:00 2001 From: Rowan Walshe Date: Tue, 19 Nov 2024 16:13:26 +0000 Subject: [PATCH] Add initial support for Ada Add pass@1 metric to pass_k.py Update pass_k.py to load the results file from .gz or .json Added basic support for Sets, enabling the translation of mbpp_473_tuple_intersection.py Co-authored-by: Rowan Walshe Co-authored-by: Fabien Chouteau --- dataset_builder/base_language_translator.py | 7 + dataset_builder/generic_translator.py | 2 + dataset_builder/humaneval_to_ada.py | 795 ++++++++++++++++++++ dataset_builder/humaneval_to_elixir.py | 6 + dataset_builder/humaneval_to_lua.py | 6 + dataset_builder/humaneval_to_luau.py | 6 + dataset_builder/humaneval_to_py.py | 6 + dataset_builder/humaneval_to_py_no_types.py | 6 + dataset_builder/humaneval_to_rb.py | 6 + dataset_builder/humaneval_to_swift.py | 6 + dataset_builder/libexperiments.py | 1 + dataset_builder/terms.csv | 1 + evaluation/Dockerfile | 3 + evaluation/src/containerized_eval.py | 2 + evaluation/src/eval_adb.py | 64 ++ evaluation/src/safe_subprocess/__init__.py | 6 +- pass_k.py | 13 +- 17 files changed, 933 insertions(+), 3 deletions(-) create mode 100644 dataset_builder/humaneval_to_ada.py create mode 100644 evaluation/src/eval_adb.py diff --git a/dataset_builder/base_language_translator.py b/dataset_builder/base_language_translator.py index 752a736da3..eb605b397d 100644 --- a/dataset_builder/base_language_translator.py +++ b/dataset_builder/base_language_translator.py @@ -42,6 +42,13 @@ def gen_dict(self, keys: List[TargetExp], values: List[TargetExp]) -> TargetExp: """ pass + @abstractmethod + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + pass + @abstractmethod def gen_call(self, func: TargetExp, args: List[TargetExp]) -> TargetExp: """ diff --git a/dataset_builder/generic_translator.py b/dataset_builder/generic_translator.py index 26e63b5dca..a4d4121e8d 100644 --- a/dataset_builder/generic_translator.py +++ b/dataset_builder/generic_translator.py @@ -27,6 +27,8 @@ def translate_expr(translator, py_expr: ast.AST): return translator.gen_list([translate_expr(translator, e) for e in elts]) case ast.Tuple(elts=elts): return translator.gen_tuple([translate_expr(translator, e) for e in elts]) + case ast.Set(elts=elts): + return translator.gen_set([translate_expr(translator, e) for e in elts]) case ast.Dict(keys=keys, values=values): return translator.gen_dict( [translate_expr(translator, e) for e in keys], diff --git a/dataset_builder/humaneval_to_ada.py b/dataset_builder/humaneval_to_ada.py new file mode 100644 index 0000000000..57229ea8db --- /dev/null +++ b/dataset_builder/humaneval_to_ada.py @@ -0,0 +1,795 @@ +"""This script can be used to translate problems from the HumanEval and +MBPP datasets into Ada 2022. + +There are several limitations of this script, including: +- Ada does not have a tuple type. We've chosen to use a record in these cases, + though it doesn't behave the same as a tuple in Python +- There are a few untyped problems or problems that use a container type but + don't specify the type of the contained elements e.g. `List` or `List[Any]` + vs `List[int]`. We can't easily, and so haven't translated these types +- On that note, it won't translate any problem that uses the type `Any` +- Ada doesn't have a built-in Optional equivalent. You can build your own using + a variant record, which is what we've chosen to do. To attempt to translate + test cases that use Optional, we have used the same approach and workaround + as in humaneval_to_rs.py +- While we have tried to limit the likelihood, it is possible that some of + translations contain invalid types or signatures, making it impossible to + pass those benchmarks + +Note also that these translations won't include examples of a large number of +Ada features which include but are not limited to: +- Subtypes +- Enumerations +- Multi-dimensional array types +- Access types +- Fixed point types +- Limited types +- Generic packages or subprograms +- Private parts +- Tasks +- Contracts +- Much of the standard library +""" + +import ast +import base64 +import re +from typing import List + +from base_language_translator import LanguageTranslator +from humaneval_to_cpp import DOCSTRING_LINESTART_RE + +TargetExp = str + +ADA_KEYWORDS = { + "abort", + "abs", + "abstract", + "accept", + "access", + "aliased", + "all", + "and", + "array", + "at", + "begin", + "body", + "case", + "constant", + "declare", + "delay", + "delta", + "digits", + "do", + "else", + "elsif", + "end", + "entry", + "exception", + "exit", + "for", + "function", + "generic", + "goto", + "if", + "in", + "interface", + "is", + "limited", + "loop", + "mod", + "new", + "not", + "null", + "of", + "or", + "others", + "out", + "overriding", + "package", + "pragma", + "private", + "procedure", + "protected", + "raise", + "range", + "record", + "rem", + "renames", + "requeue", + "return", + "reverse", + "select", + "separate", + "some", + "subtype", + "synchronized", + "tagged", + "task", + "terminate", + "then", + "type", + "until", + "use", + "when", + "while", + "with", + "xor", +} +STANDARD_LIBRARY_TYPES = { + "boolean", + "integer", + "short_short_integer", + "short_integer", + "long_integer", + "long_long_integer", + "short_float", + "float", + "long_float", + "long_long_float", + "string", + "wide_string", + "duration", +} + +# These types might be generated, but are not valid. For now we'll just fail to +# translate prompts that try to generate these types +INVALID_TYPES = [ + "Integer_Array_Array", + "Unbounded_String_Array_Array", + "Integer_Integer_Array_Tuple", +] + +ASCII_CHARACTERS = { + "\x00": "ASCII.NUL", + "\x01": "ASCII.SOH", + "\x02": "ASCII.STX", + "\x03": "ASCII.ETX", + "\x04": "ASCII.EOT", + "\x05": "ASCII.ENQ", + "\x06": "ASCII.ACK", + "\x07": "ASCII.BEL", + "\x08": "ASCII.BS", + "\x09": "ASCII.HT", + "\x0a": "ASCII.LF", + "\x0b": "ASCII.VT", + "\x0c": "ASCII.FF", + "\x0d": "ASCII.CR", + "\x0e": "ASCII.SO", + "\x0f": "ASCII.SI", + "\x10": "ASCII.DLE", + "\x11": "ASCII.DC1", + "\x12": "ASCII.DC2", + "\x13": "ASCII.DC3", + "\x14": "ASCII.DC4", + "\x15": "ASCII.NAK", + "\x16": "ASCII.SYN", + "\x17": "ASCII.ETB", + "\x18": "ASCII.CAN", + "\x19": "ASCII.EM", + "\x1a": "ASCII.SUB", + "\x1b": "ASCII.ESC", + "\x1c": "ASCII.FS", + "\x1d": "ASCII.GS", + "\x1e": "ASCII.RS", + "\x1f": "ASCII.US", + "\x7f": "ASCII.DEL", +} + +CAMEL_REGEX_1 = re.compile("(.)([A-Z][a-z]+)") +CAMEL_REGEX_2 = re.compile("__([A-Z])") +CAMEL_REGEX_3 = re.compile("([a-z0-9])([A-Z])") + + +def camel_to_snake(name: str) -> str: + # Taken from: https://stackoverflow.com/a/1176023 + name = CAMEL_REGEX_1.sub(r"\1_\2", name) + name = CAMEL_REGEX_2.sub(r"_\1", name) + name = CAMEL_REGEX_3.sub(r"\1_\2", name) + return name.lower() + + +def ada_case(name: str) -> str: + return camel_to_snake(name).title() + + +def python_string_to_ada_string(s: str) -> str: + # TODO figure out what to do with UTF-8 🙈 + s = s.replace('"', '""') + for c in ASCII_CHARACTERS: + s.replace(c, f'" & {ASCII_CHARACTERS[c]} & "') + return s + + +def make_valid_ada_name(name: str) -> str: + """Make a valid Ada name from a string. + This is a very simple implementation, and almost certainly not correct for + all cases, but should be sufficient for our purposes. Replaces all non-word + characters with underscores.""" + return re.sub(r"\W", "_", name) + + +def coerce(expr: str, type) -> str: + """Addresses differences in literal syntax due to our selected method of translating types + + Optional: We've used a variant record to represent an optional type. This means we can't just + use the value or None / equivalent. There is also one edge case for HumanEval 136 which is + implemented in a non-generic way, but makes that benchmark valid. + + Strings: If a string is an argument to our candidate function, or the return type, we can use + a regular String. If a string is part of a container like a record or an array for example, + we can't just use the String type. For records we could use a discriminated record to define + the length of the sting. We've however chosen to just use the Unbounded_String type in these + cases. If a string is part of a Dict, we can't use an Unbounded_String as a key, so we've + chosen to use the String type for both the key and value in this case. + """ + + def coerce_to_option(expr: str) -> str: + if expr == "None" or expr == "null": + return "(Valid => False)" + else: + return f"(Valid => True, Value => {make_strings_unbounded(expr)})" + + match expr, type: + case expr, ast.Name(id="str"): + return make_strings_bounded(expr) + case expr, ast.Subscript(ast.Name("Optional"), _): + return coerce_to_option(expr) + case expr, ast.Subscript( + ast.Name("Tuple"), ast.Tuple([_, ast.Constant(value=ast.Ellipsis)], _) + ): + return f"[{expr[1:-1]}]" # Replace parentheses with square brackets + case expr, ast.Subscript( + ast.Name("Tuple"), + ast.Tuple( + [ + ast.Subscript(ast.Name("Optional")), + ast.Subscript(ast.Name("Optional")), + ], + _, + ), + ): + # This is a special case for just one benchmark (HumanEval_136), which + # uses Tuple[Option[int], Option[int]]. There is something more rigorous + # to be done here where we properly coerce things. But I, like the + # implementor of the rust translator, do not want to do it + l, r = expr.strip("()").split(", ") + return f"({coerce_to_option(l)}, {coerce_to_option(r)})" + case expr, ast.Subscript( + ast.Name("Dict"), + slice=ast.Tuple( + elts=[ast.Name("str"), ast.Subscript(ast.Name("Optional"))] + ), + ): + # Workaround for mbpp_465, which has an argument of type Dict[str, Optional[str]] + expr = expr[1:-1] # Remove the surrounding parentheses + kv_pairs = expr.split(", ") + for i in range(len(kv_pairs)): + pair = kv_pairs[i] + k, v = pair.split(" => ") + kv_pairs[i] = f"{k} => {coerce_to_option(v)}" + return f"[{', '.join(kv_pairs)}]" + case _: + return expr + + +def extract_arguments(expr: str) -> List[str]: + """Given a function call extract a list of the top level arguments. e.g.: + - "Candidate (1, 2, 3)" -> ['1', '2', '3'] + - "Candidate ("foo", (1, 2, 3))" -> ['"foo"', '(1, 2, 3)'] + + Assumes that expr has arguments, which is the case for HumanEval and MBPP. + """ + # Remove the function name and parentheses + start = expr.index("(") + 1 + end = expr.rindex(")") + arguments_str = expr[start:end].strip() + + arguments = [] + current_arg = [] + nested_level = 0 + in_string = False + + for char in arguments_str: + if char == "," and not in_string and nested_level == 0: + # When we encounter a comma at the top level, split the argument + arguments.append("".join(current_arg).strip()) + current_arg = [] + else: + current_arg.append(char) + # Handle nested parentheses + if char == '"': + in_string = not in_string + elif not in_string and char in ["(", "["]: + nested_level += 1 + elif not in_string and char in [")", "]"]: + nested_level -= 1 + + # Append the last argument + if current_arg: + arguments.append("".join(current_arg).strip()) + + return arguments + + +BASE64_PATTERN = re.compile( + r"`(?P(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{4}|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{2}={2}))`" +) + + +def create_b64_encoded_string(value: str) -> str: + """Convert a string to a base64 encoded string""" + utf8_bytes = value.encode("utf-8") + base64_bytes = base64.b64encode(utf8_bytes) + base64_string = base64_bytes.decode("utf-8") + return base64_string + + +def decode_bounded_string(match) -> str: + """Decode a base64 encoded string that is surrounded by quotes to a bounded string e.g.: + - '""' -> '""' + - '"YWJj"' -> '"abc"' + - '"Zm9vIiJiYXI="' => '"foo""bar"' + """ + base64_string = match.group("b64") + utf8_bytes = base64.b64decode(base64_string) + utf8_string = utf8_bytes.decode("utf-8") + return f'"{utf8_string}"' + + +def decode_unbounded_string(match) -> str: + """Decode a base64 encoded string that is surrounded by quotes to an unbounded string e.g.: + - '""' -> 'To_Unbounded_String ("")' + - '"YWJj"' -> 'To_Unbounded_String ("abc")' + - '"Zm9vIiJiYXI="' => 'To_Unbounded_String ("foo""bar")' + """ + utf8_string = decode_bounded_string(match) + return f"To_Unbounded_String ({utf8_string})" + + +def make_strings_bounded(expr: str) -> str: + """Replace all strings in the expr with bounded strings, + decoding the base64 format in the process""" + if expr == "``": + return '""' + return BASE64_PATTERN.sub(decode_bounded_string, expr) + + +def make_strings_unbounded(expr: str) -> str: + """Replace all strings in the expr with unbounded strings, + decoding the base64 format in the process""" + if expr == "``": + return '""' + return BASE64_PATTERN.sub(decode_unbounded_string, expr) + + +SUBP_NAME_PATTERN = re.compile(r"^(?P\S+)\s+\(.*\)$") + + +def get_subp_name(expr: str) -> str: + """Get the subprogram name of the lhs expression. Should be Candidate""" + return SUBP_NAME_PATTERN.match(expr).group("subp") + + +class TranslationDesignError(Exception): + pass + + +class Translator(LanguageTranslator[TargetExp]): + """Translator class for Ada 2022 + + This class can be used to translate problems from the HumanEval and MBPP datasets into Ada 2022. + Note that this translator was crafted with the HumanEval and MBPP datasets in mind, and you may + encounter issues with translating prompts from other datasets, even if they appear to use the + same types as those supported in the HumanEval and MBPP datasets. + + Types will generally be translated as follows: + - int -> Integer + - float -> Float + - bool -> Boolean + - str + - String + - If the string is an argument to the candidate function or the return type + - If the string is a key or value in a Dict + - Unbounded_String - Otherwise + - Optional -> Ada doesn't have a built in Optional type. We've implemented this as a variant record + - None -> This is generally only used as a test value for Optional types. Assuming this is the + case, we've translated this as (Valid => False), to match our implementation of Optional + - List + - Vector - If the list is an element of another container + - Array - Otherwise + - Tuple + - Array / Vector - Translated as if it was a list, if it variable length e.g. Tuple[str, ...] in HumanEval_148 + - Record - Otherwise + - Dict -> Indefinite_Ordered_Map + - Set -> Indefinite_Ordered_Set + """ + + def __init__(self) -> None: + super().__init__() + self.reinit() + self.float_type = "Float" + self.int_type = "Integer" + self.bool_type = "Boolean" + self.array_type = "Array" + self.indent = " " * 3 + self._custom_type_decls = [] + self._use_statements = set() + + def reinit(self) -> None: + self.subprogram_name = None + self._custom_type_decls = [] + self._use_statements = set() + self._imports = set() + + def gen_set_type(self, elem_type): + # Probably won't work complex examples, but there is only one "valid" problem that uses set in MBPP and HumanEval + element = self.translate_pytype(elem_type) + if element == "Integer_Integer_Tuple": + # Workaround for MBPP_473, as we need a < operator for type to be able to use an ordered set + self._custom_type_decls.append( + 'function "<" (Left, Right : Integer_Integer_Tuple) return Boolean is\n (Left.Integer_1 < Right.Integer_1 or else (Left.Integer_1 = Right.Integer_1 and then Left.Integer_2 < Right.Integer_2));' + ) + type_name = make_valid_ada_name(f"{element}_Sets") + self._imports.add("with Ada.Containers.Indefinite_Ordered_Sets;") + decl = f"package {type_name} is new Ada.Containers.Indefinite_Ordered_Sets (Element_Type => {element});\n use {type_name};" + self._custom_type_decls.append(decl) + self._use_statements.add(f"use {type_name};") + return f"{type_name}.Set" + + def gen_array_type(self, elem_type): + # TODO handle cases where element isn't a fixed size e.g. strings, unconstrained arrays etc. + element = self.translate_pytype(elem_type) + type_name = make_valid_ada_name(f"{element}_Array") + decl = f"type {type_name} is array (Positive range <>) of {element};" + self._custom_type_decls.append(decl) + return type_name + + def gen_vector_type(self, elem_type): + element = self.translate_pytype(elem_type) + type_name = make_valid_ada_name(f"{element}_Vector") + self._imports.add("with Ada.Containers.Vectors;") + decl = f"package {type_name} is new Ada.Containers.Vectors (Index_Type => Positive, Element_Type => {element});\n use {type_name};" + self._custom_type_decls.append(decl) + self._use_statements.add(f"use {type_name};") + return f"{type_name}.Vector" + + def gen_optional_type(self, elem_type): + element = self.translate_pytype(elem_type) + type_name = f"{element}_Option" + decl = f"type {type_name} (Valid : Boolean := False) is record\n case Valid is\n when True =>\n Value : {element};\n when False =>\n null;\n end case;\nend record;" + self._custom_type_decls.append(decl) + return type_name + + def gen_tuple_type(self, elts): + element_types = [self.translate_pytype(elem) for elem in elts] + + type_name = make_valid_ada_name("_".join(element_types) + "_Tuple") + decl = f"type {type_name} is record\n" + count = 1 + for elt in element_types: + decl += f" {make_valid_ada_name(elt)}_{count} : {elt};\n" + count += 1 + decl += " end record;\n" + self._custom_type_decls.append(decl) + return type_name + + def gen_dict_type(self, key_type, value_type): + # We can't use an Unordered_String as a key in a Map + key = self.translate_pytype(key_type, True) + value = self.translate_pytype(value_type, True) + type_name = make_valid_ada_name(f"{key}_{value}_Dict") + self._imports.add("with Ada.Containers.Indefinite_Ordered_Maps;") + decl = f"package {type_name} is new Ada.Containers.Indefinite_Ordered_Maps (Key_Type => {key}, Element_Type => {value});\n use {type_name};" + self._custom_type_decls.append(decl) + self._use_statements.add(f"use {type_name};") + return f"{type_name}.Map" + + def translate_pytype(self, ann: ast.expr | None, top_level: bool = False) -> str: + """Traverses an AST annotation and translate Python type annotation to an Ada Type""" + + if ann is None: + raise Exception("No annotation") + + # Todo add missing Set type + match ann: + case ast.Name(id="str"): + if top_level: + return "String" + self._imports.add( + "with Ada.Strings.Unbounded; use Ada.Strings.Unbounded;" + ) + return "Unbounded_String" + case ast.Name(id="int"): + return self.int_type + case ast.Name(id="float"): + return self.float_type + case ast.Name(id="bool"): + return self.bool_type + case ast.Name(id="None"): + # It appears None is always used in optional + raise Exception("None type not implemented") + case ast.Name(id="Set"): + raise Exception("Set without defined element type not implemented") + case ast.List: + raise Exception("List without defined element type not implemented") + case ast.Tuple: + raise Exception("Tuple not implemented") + case ast.Dict: + raise Exception( + "Dict without defined key and value types not implemented" + ) + case ast.Subscript( + value=ast.Name(id="Dict"), slice=ast.Tuple(elts=key_val_type) + ): + return self.gen_dict_type(key_val_type[0], key_val_type[1]) + case ast.Subscript(value=ast.Name(id="List"), slice=elem_type): + if top_level: + return self.gen_array_type(elem_type) + return self.gen_vector_type(elem_type) + case ast.Subscript( + value=ast.Name(id="Tuple"), + slice=ast.Tuple([elem_type, ast.Constant(value=Ellipsis)], _), + ): + # Special case for when we have a variable length tuple with a typehint like Tuple[int, ...] e.g. HumanEval_148 + if top_level: + return self.gen_array_type(elem_type) + return self.gen_vector_type(elem_type) + case ast.Subscript(value=ast.Name(id="Tuple"), slice=ast.Tuple(elts=elems)): + return self.gen_tuple_type(elems) + case ast.Subscript(value=ast.Name(id="Optional"), slice=elem_type): + return self.gen_optional_type(elem_type) + case ast.Subscript(value=ast.Name(id="Union"), slice=ast.Tuple(elts=elems)): + raise Exception("Union Not implemented") + case ast.Subscript(value=ast.Name(id="Set"), slice=elem_type): + return self.gen_set_type(elem_type) + case ast.Name(id="Any"): + raise Exception("Any type not implemented") + case ast.Constant(value=None): + raise Exception("None constant type not implemented") + case ast.Constant(value=ast.Ellipsis): + raise Exception( + "Ellipsis constant type not implemented, other than the tuple workaround" + ) + case _: + print(f"Unhandled annotation: {ast.dump(ann)}") + raise Exception(f"Unhandled annotation: {ann}") + + def gen_literal(self, c: bool | str | int | float | None) -> TargetExp: + """ + Translate a literal expression + c: is the literal value + """ + match c: + case bool() | int() | float(): + return str(c) + case str(): + """We don't know at this point if the string can be bounded, or must be unbounded. + Instead we'll just output a string and then later during the call to `finalize` + we'll create bounded or unbounded strings where needed. + + By formatting the string using b64, and surrounding it by backticks, which aren't + used in ada, it makes it easier to identify which strings have or haven't been + converted back to strings during the call to `finalize` + """ + string = python_string_to_ada_string(c) + return f"`{create_b64_encoded_string(string)}`" + case None: + return "null" + case _: + raise TranslationDesignError(f"Unhandled expression: {c}") + + def gen_var(self, v: str) -> TargetExp: + """ + Translate a variable with name v. + """ + v = v.lower() # Ada is case insensitive + + # We don't have to rename variables who's names clash with types from the standard library + # But doing some will make more normal subprogram specifications + if v in ADA_KEYWORDS or v in STANDARD_LIBRARY_TYPES: + return ada_case(f"my_{v}") + return ada_case(v) + + def gen_list(self, l: List[TargetExp]) -> TargetExp: + """ + Translate a list with elements l + """ + return "[" + ", ".join([make_strings_unbounded(i) for i in l]) + "]" + + def gen_tuple(self, t: List[TargetExp]) -> TargetExp: + """ + Translate a tuple with elements t + """ + return "(" + ", ".join([make_strings_unbounded(i) for i in t]) + ")" + + def gen_dict(self, keys: List[TargetExp], values: List[TargetExp]) -> TargetExp: + """ + Translate a dictionary with keys and values + """ + return ( + "[" + + ", ".join( + [ + f"{make_strings_bounded(k)} => {make_strings_bounded(v)}" + for k, v in zip(keys, values) + ] + ) + + "]" + ) + + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + return "[" + ", ".join([make_strings_unbounded(i) for i in s]) + "]" + + def gen_call(self, func: TargetExp, args: List[TargetExp]) -> TargetExp: + """ + Translate a function call `func(args)` + """ + return f"{func} ({', '.join(args)})" + + def package_imports(self) -> str: + # TODO handle cases where more imports are needed e.g. vector/hashmap + return "\n".join(["pragma Ada_2022;", *self._imports]) + "\n" + + def translate_prompt( + self, name: str, args: List[ast.arg], returns: ast.expr, description: str + ) -> str: + """ + Translate Python prompt. + """ + self.reinit() + self.type = [[arg.annotation for arg in args], returns] + + comment_start = self.indent + "-- " + ada_description = ( + comment_start + + DOCSTRING_LINESTART_RE.sub("\n" + comment_start, description.strip()) + + "\n" + ) + self.subprogram_name = ada_case(name) + self.subprogram_type = "function" # Will always use function as all subprograms in MBPP and HumanEval have return types + self.args_type = [self.translate_pytype(arg.annotation, True) for arg in args] + formal_args = [ + f"{self.gen_var(arg.arg)} : {self.translate_pytype(arg.annotation, True)}" + for arg in args + ] + formal_arg_list = "; ".join(formal_args) + self.return_type = self.translate_pytype(returns, True) + subprogram_signature = ( + f"{self.subprogram_type} {self.subprogram_name} ({formal_arg_list})" + ) + self.candidate_signature = ( + f"{self.subprogram_type} Candidate ({formal_arg_list})" + ) + if self.subprogram_type == "function": + subprogram_signature = f"{subprogram_signature} return {self.return_type}" + self.candidate_signature = ( + f"{self.candidate_signature} return {self.return_type}" + ) + + # To be able to use custom types such as arrays of integers, the prompt + # starts with the specification of a "Placeholder" pacakge where we + # declare these types. Then comes the declaration the sub-program to be + # competed, and finally the beginning of "Placeholder" package body. + # + # Later in the testsuite prefix/suffix, we add a Main procedure to the + # output. + # + # The result should be an Ada file that contains both the specification + # and body of a "Placeholder" package, and a main procedure. This will + # will be split in several .ads and .adb files using gnatchop in + # evaluation phase. + + for custom_type in set(self._custom_type_decls): + for invalid_type in INVALID_TYPES: + if invalid_type in custom_type: + raise TranslationDesignError( + f'Tried to generate invalid type: "{custom_type}"' + ) + + ada_spec = f"{self.package_imports()}\n" + ada_spec += "package Placeholder is\n" + seen_types = set() + for custom_type in self._custom_type_decls: + if custom_type not in seen_types: + seen_types.add(custom_type) + ada_spec += f"{self.indent}{custom_type}\n" + ada_spec += f"{self.indent}{subprogram_signature};\n{ada_description}\n" + ada_spec += "end Placeholder;\n\n" + + ada_body = f"{self.package_imports()}\n" + ada_body += "package body Placeholder is\n" + ada_body += f"{self.indent}{subprogram_signature}" + + ada_prompt = ada_spec + ada_body + return ada_prompt + + def test_suite_prefix_lines(self, entry_point: str) -> List[str]: + """ + This code goes at the start of the test suite. + The entry_point is ??? + """ + return [ + "", + f"{self.indent}end {self.subprogram_name};", + "", + "end Placeholder;", + "", + self.package_imports().strip(), + "with Placeholder; use Placeholder;", + "", + "procedure Main is", + "", + *[f"{self.indent}{use}" for use in self._use_statements], + "", + f"{self.indent}{self.candidate_signature} renames Placeholder.{self.subprogram_name};", + "", + "begin", + ] + + def test_suite_suffix_lines(self) -> List[str]: + """ + This code goes at the end of the test suite. + """ + return ["end Main;"] + + def deep_equality(self, left: TargetExp, right: TargetExp) -> str: + """ + All tests are assertions that compare deep equality between left and right. + """ + return f"{self.indent}pragma Assert ({left} = {right});" + + def file_ext(self) -> str: + """ + The file extension for this language + """ + return "adb" + + def stop(self) -> List[str]: + """ + The list of stop tokens for this language + """ + if self.subprogram_name is None: + raise TranslationDesignError("subprogram_name should never be None") + return [f"\n{self.indent}end "] + + def no_completion_prompt_stub(self) -> str: + """ + A default stub to create a syntactically valid translation in case of not performing completion. + For example, for Rust this could be: + + todo!() + } + + """ + if self.subprogram_name is None: + raise TranslationDesignError("subprogram_name should never be None") + return f'raise Program_Error with "Not implemented";\n end {self.subprogram_name};' + + def create_strings_in_lhs(self, lhs_expr: str) -> str: + """This is used to properly format strings in the lhs of a test case. + + Extract all of the top level arguments of the lhs expression. For each + top level argument: + - If it is a string, convert it to a bounded string + - Otherwise extract all strings and convert them to unbounded strings + Then rebuild the lhs function call. + """ + subp_name = get_subp_name(lhs_expr) + args = extract_arguments(lhs_expr) + assert len(args) == len(self.type[0]) + args = [coerce(x, y) for x, y in zip(args, self.type[0])] + return f"{subp_name} ({', '.join(args)})" + + def finalize(self, result, context) -> str: + match context: + case "lhs": + # return result + return self.create_strings_in_lhs(result) + case "rhs": + return coerce(result, self.type[1]) + case _: + raise Exception("bad context to finalize") diff --git a/dataset_builder/humaneval_to_elixir.py b/dataset_builder/humaneval_to_elixir.py index 3e7cdb2ee6..34a1af25e4 100644 --- a/dataset_builder/humaneval_to_elixir.py +++ b/dataset_builder/humaneval_to_elixir.py @@ -99,6 +99,12 @@ def gen_dict(self, keys: List[TargetExp], values: List[TargetExp]) -> TargetExp: """ return "%{" + ", ".join(f"{k} => {v}" for k, v in zip(keys, values)) + "}" + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + raise NotImplementedError("This translator does not currently support translating sets") + def gen_call(self, func: TargetExp, args: List[TargetExp]) -> str: """Translate a function call `func(args)` A function call f(x, y, z) translates to f(x, y, z) diff --git a/dataset_builder/humaneval_to_lua.py b/dataset_builder/humaneval_to_lua.py index 8d1707e7cc..1712573231 100644 --- a/dataset_builder/humaneval_to_lua.py +++ b/dataset_builder/humaneval_to_lua.py @@ -75,6 +75,12 @@ def gen_dict(self, keys: List[TargetExp], values: List[TargetExp]) -> TargetExp: """ return "{" + ", ".join(f"[{k}] = {v}" for k, v in zip(keys, values)) + "}" + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + raise NotImplementedError("This translator does not currently support translating sets") + def gen_call(self, func: TargetExp, args: List[TargetExp]) -> TargetExp: """Translate a function call `func(args)` A function call f(x, y, z) translates to f(x, y, z) diff --git a/dataset_builder/humaneval_to_luau.py b/dataset_builder/humaneval_to_luau.py index ac38be9505..dee6fe43fb 100644 --- a/dataset_builder/humaneval_to_luau.py +++ b/dataset_builder/humaneval_to_luau.py @@ -224,6 +224,12 @@ def gen_dict(self, keys: List[TargetExp], values: List[TargetExp]) -> TargetExp: """ return "{" + ", ".join(f"[{k}] = {v}" for k, v in zip(keys, values)) + "}" + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + raise NotImplementedError("This translator does not currently support translating sets") + def gen_call(self, func: TargetExp, args: List[TargetExp]) -> TargetExp: """Translate a function call `func(args)` A function call f(x, y, z) translates to f(x, y, z) diff --git a/dataset_builder/humaneval_to_py.py b/dataset_builder/humaneval_to_py.py index 260538d907..c70af5b150 100644 --- a/dataset_builder/humaneval_to_py.py +++ b/dataset_builder/humaneval_to_py.py @@ -96,6 +96,12 @@ def gen_tuple(self, t: List[str]) -> str: def gen_dict(self, keys: List[str], values: List[str]) -> str: return "{ " + ", ".join(f'{k}: {v}' for k, v in zip(keys, values)) + " }" + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + raise NotImplementedError("This translator does not currently support translating sets") + def gen_call(self, func: str, args: List[str]) -> str: return func + "(" + ", ".join(args) + ")" diff --git a/dataset_builder/humaneval_to_py_no_types.py b/dataset_builder/humaneval_to_py_no_types.py index a1695a5f11..444a591819 100644 --- a/dataset_builder/humaneval_to_py_no_types.py +++ b/dataset_builder/humaneval_to_py_no_types.py @@ -61,6 +61,12 @@ def gen_tuple(self, t: List[str]) -> str: def gen_dict(self, keys: List[str], values: List[str]) -> str: return "{ " + ", ".join(f'{k}: {v}' for k, v in zip(keys, values)) + " }" + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + raise NotImplementedError("This translator does not currently support translating sets") + def gen_call(self, func: str, args: List[str]) -> str: return func + "(" + ", ".join(args) + ")" diff --git a/dataset_builder/humaneval_to_rb.py b/dataset_builder/humaneval_to_rb.py index f14b86a159..f8d12f832f 100644 --- a/dataset_builder/humaneval_to_rb.py +++ b/dataset_builder/humaneval_to_rb.py @@ -101,6 +101,12 @@ def gen_dict(self, keys: List[TargetExp], values: List[TargetExp]) -> TargetExp: """ return "{" + ", ".join(f"{k} => {v}" for k, v in zip(keys, values)) + "}" + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + raise NotImplementedError("This translator does not currently support translating sets") + def gen_call(self, func: TargetExp, args: List[TargetExp]) -> str: """Translate a function call `func(args)` A function call f(x, y, z) translates to f(x, y, z) diff --git a/dataset_builder/humaneval_to_swift.py b/dataset_builder/humaneval_to_swift.py index b25ebd18c3..dfefb23a9c 100644 --- a/dataset_builder/humaneval_to_swift.py +++ b/dataset_builder/humaneval_to_swift.py @@ -550,6 +550,12 @@ def gen_tuple(self, t: List[TargetExp]) -> TargetExp: def gen_dict(self, keys: List[TargetExp], values: List[TargetExp]) -> TargetExp: return ast.Dict(keys, values) + def gen_set(self, s: List[TargetExp]) -> TargetExp: + """ + Translate a set with elements s + """ + raise NotImplementedError("This translator does not currently support translating sets") + def gen_call(self, func: TargetExp, args: List[TargetExp]) -> TargetExp: return ast.Call(func, args) diff --git a/dataset_builder/libexperiments.py b/dataset_builder/libexperiments.py index 6f8a090036..c896e6024b 100644 --- a/dataset_builder/libexperiments.py +++ b/dataset_builder/libexperiments.py @@ -41,6 +41,7 @@ def path(self) -> Path: "hs", "elixir", "clj", + "ada", ] MODELS = ["davinci", "incoder", "codegen"] diff --git a/dataset_builder/terms.csv b/dataset_builder/terms.csv index 2e7f9cb845..8b089d40fe 100644 --- a/dataset_builder/terms.csv +++ b/dataset_builder/terms.csv @@ -1,4 +1,5 @@ Python,py,array,list,tuple,dictionary,None,True,False +Ada,adb,array,Vector,record,Map,null,True,False Bash,sh,array,list,list,CSV,None,true,false C#,cs,list,list,tuple,dictionary,null,true,false C++,cpp,vector,vector,tuple,map,None,true,false diff --git a/evaluation/Dockerfile b/evaluation/Dockerfile index 0ecb5df5e7..a270e787f1 100644 --- a/evaluation/Dockerfile +++ b/evaluation/Dockerfile @@ -22,6 +22,9 @@ RUN apt-get update -yqq && apt-get install -yqq \ RUN apt-get install -yqq libtest-deep-perl RUN apt-get install -yqq wget +# Ada +RUN apt-get install -yqq gnat-12 + # JS/TS RUN curl -fsSL https://deb.nodesource.com/setup_current.x | bash - RUN apt-get install -y nodejs diff --git a/evaluation/src/containerized_eval.py b/evaluation/src/containerized_eval.py index 2b82af80eb..533fb68274 100644 --- a/evaluation/src/containerized_eval.py +++ b/evaluation/src/containerized_eval.py @@ -4,6 +4,7 @@ """ from pathlib import Path +import eval_adb import eval_ruby import eval_lua import eval_python @@ -32,6 +33,7 @@ EVALUATORS = { + "ada": (eval_adb.eval_script, ".adb"), "rb": (eval_ruby.eval_script, ".rb"), "lua": (eval_lua.eval_script, ".lua"), "python": (eval_python.eval_script, ".py"), diff --git a/evaluation/src/eval_adb.py b/evaluation/src/eval_adb.py new file mode 100644 index 0000000000..5f177230aa --- /dev/null +++ b/evaluation/src/eval_adb.py @@ -0,0 +1,64 @@ +from pathlib import Path +from safe_subprocess import run +from generic_eval import main + + +LANG_NAME = "Ada" +LANG_EXT = ".adb" + + +def eval_script(path: Path): + working_dir: Path = path.parent / (path.stem + "_tmp") + working_dir.mkdir() + chop_result = run(["gnatchop", "-w", path, working_dir]) + if chop_result.exit_code != 0: + return { + "status": "SyntaxError (gnatchop)", + "exit_code": chop_result.exit_code, + "stdout": chop_result.stdout, + "stderr": chop_result.stderr, + } + + build_result = run( + [ + "gnatmake", + "-gnatW8", + "main.adb", + "-o", + "main", + "-g", + "-j0", + "-gnata", + "-gnat2022", + "-gnateE", + "-bargs", + "-Es", + ], + cwd=str(working_dir), + ) + if build_result.exit_code != 0: + return { + "status": "SyntaxError (gnatmake)", + "exit_code": build_result.exit_code, + "stdout": build_result.stdout, + "stderr": build_result.stderr, + } + + status = "OK" + run_result = run(["./main"], cwd=str(working_dir)) + + if run_result.timeout: + status = "Timeout" + elif run_result.exit_code != 0: + status = "Exception" + + return { + "status": status, + "exit_code": run_result.exit_code, + "stdout": run_result.stdout, + "stderr": run_result.stderr, + } + + +if __name__ == "__main__": + main(eval_script, LANG_NAME, LANG_EXT) diff --git a/evaluation/src/safe_subprocess/__init__.py b/evaluation/src/safe_subprocess/__init__.py index 15b7308a82..aa639c1441 100644 --- a/evaluation/src/safe_subprocess/__init__.py +++ b/evaluation/src/safe_subprocess/__init__.py @@ -29,8 +29,11 @@ def set_nonblocking(reader): def run( - args: List[str], timeout_seconds: int = 15, max_output_size: int = 2048, + args: List[str], + timeout_seconds: int = 15, + max_output_size: int = 2048, env = None, + cwd: str | None = None ) -> Result: """ Runs the given program with arguments. After the timeout elapses, kills the process @@ -45,6 +48,7 @@ def run( stderr=subprocess.PIPE, start_new_session=True, bufsize=MAX_BYTES_PER_READ, + cwd=cwd ) set_nonblocking(p.stdout) set_nonblocking(p.stderr) diff --git a/pass_k.py b/pass_k.py index aacfd9ce10..c63e74819b 100644 --- a/pass_k.py +++ b/pass_k.py @@ -21,6 +21,7 @@ from pathlib import Path import itertools import argparse +import json from multipl_e.util import gunzip_json, eprint @@ -33,8 +34,13 @@ def estimator(n: int, c: int, k: int) -> float: return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) -def for_file(path): - data = gunzip_json(path) +def for_file(path: Path): + if path.suffix == ".gz": + data = gunzip_json(path) + else: + with open(path, 'r') as f: + data = json.load(f) + if data is None: return None n = len(data["results"]) @@ -75,8 +81,11 @@ def main(): min_completions = np.min([r["n"] for r in results]) max_completions = np.max([r["n"] for r in results]) if temperature == 0.8: + pass_1 = np.mean([r["pass@1"] for r in results]) pass_10 = np.mean([r["pass@10"] for r in results]) pass_100 = np.mean([r["pass@100"] for r in results]) + print( + f"{name},1,{pass_1},{num_problems},{min_completions},{max_completions}") print( f"{name},10,{pass_10},{num_problems},{min_completions},{max_completions}") print(