From 7b8d5e1c3f2fab9d2dacd0079085c5a6bee09e4a Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Sat, 25 May 2024 01:17:56 -0500 Subject: [PATCH] Fix invalid regex in unconstrained arrays for json_schema.py --- outlines/fsm/json_schema.py | 23 +++++++++++++------- tests/fsm/test_json_schema.py | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 37f04f610..ac819bc07 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -297,15 +297,22 @@ def to_regex( # Here we need to make the choice to exclude generating list of objects # if the specification of the object is not given, even though a JSON # object that contains an object here would be valid under the specification. - types = [ + legal_types = [ {"type": "boolean"}, {"type": "null"}, {"type": "number"}, {"type": "integer"}, {"type": "string"}, ] - regexes = [to_regex(resolver, t, whitespace_pattern) for t in types] - return rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}){allow_empty}{whitespace_pattern}\]" + depth = instance.get("depth", 2) + if depth > 0: + legal_types.append({"type": "object", "depth": depth - 1}) + legal_types.append({"type": "array", "depth": depth - 1}) + + regexes = [ + to_regex(resolver, t, whitespace_pattern) for t in legal_types + ] + return rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}{allow_empty}{whitespace_pattern}\]" elif instance_type == "object": # pattern for json object with values defined by instance["additionalProperties"] @@ -328,20 +335,20 @@ def to_regex( # unset or True, it is unconstrained object. # We handle this by setting additionalProperties to anyOf: {all types} - legal_values = [ + legal_types = [ {"type": "string"}, {"type": "number"}, {"type": "boolean"}, - {"type": "null"} - # { "type": "array" }, # TODO: enable arrays within object-types + {"type": "null"}, ] # We set the object depth to 2 to keep the expression finite, but the "depth" # key is not a true component of the JSON Schema specification. depth = instance.get("depth", 2) if depth > 0: - legal_values.append({"type": "object", "depth": depth - 1}) - additional_properties = {"anyOf": legal_values} + legal_types.append({"type": "object", "depth": depth - 1}) + legal_types.append({"type": "array", "depth": depth - 1}) + additional_properties = {"anyOf": legal_types} value_pattern = to_regex( resolver, additional_properties, whitespace_pattern diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 2a5a311d2..efc5c054f 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -719,6 +719,46 @@ def test_format(schema, regex, examples): ('{"time":20:20:39Z}', False), # missing quotes for value ], ), + # Unconstrained Object + ( + { + "title": "Foo", + "type": "object", + }, + [ + ("{}", True), + ('{"a": 1, "b": null}', True), + ('{"a": {"z": {"g": 4}}, "b": null}', True), + ("1234", False), # not an object + ('["a", "a"]', False), # not an array + ], + ), + # Unconstrained Array + ( + { + "type": "array", + }, + [ + ("[1, {}, false]", True), + ("[{}]", True), + ('[{"a": {"z": "q"}, "b": null}]', True), + ('[{"a": [1, 2, true], "b": null}]', True), + ('[{"a": [1, 2, true], "b": {"a": "b"}}, 1, true, [1, [2]]]', True), + # too deep, default unconstrained depth limit = 2 + ( + '[{"a": [1, 2, true], "b": {"a": "b"}}, 1, true, [1, [2, [3]]]]', + False, + ), + ('[{"a": {"z": {"g": 4}}, "b": null}]', False), + ("[[[[1]]]]", False), + # not an array + ("{}", False), + ('{"a": 1, "b": null}', False), + ('{"a": {"z": {"g": 4}}, "b": null}', False), + ("1234", False), # not an array + ('{"a": "a"}', False), # not an array + ], + ), ], ) def test_format_without_regex(schema, examples):