Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support required keyword in JSON Schema #495

Merged
merged 2 commits into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions outlines/fsm/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,10 @@ def to_regex(resolver: Resolver, instance: dict):
----
Many features of JSON schema are missing:
- Support the fact that fields in an object are optional by default
- Handle `required` keyword
- Handle `additionalProperties` keyword
- Handle types defined as a list
- Handle constraints on numbers
- Handle special patterns: `date`, `uri`, etc.
- Handle optional fields (not in `required`)

This does not support recursive definitions.

Expand All @@ -102,13 +100,43 @@ def to_regex(resolver: Resolver, instance: dict):
if "properties" in instance:
regex = ""
regex += r"\{"
for i, (name, value) in enumerate(instance["properties"].items()):
regex += f'{whitespace}"{name}"{whitespace}:{whitespace}'
regex += to_regex(resolver, value)

# No comma after the last key-value pair in JSON
if i < len(instance["properties"]) - 1:
regex += f"{whitespace},"
properties = instance["properties"]
required_properties = instance.get("required", [])
is_required = [item in required_properties for item in properties]
# If at least one property is required, we include the one in the last position
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this changing the order of the fields as they were specified in the JSON schema?

Copy link
Member

@rlouf rlouf Jan 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# We need to make sure that the last item generated does not end with a comma and thus end up with invalid JSON.

# When at least one property is required, we select the last in the list and include it at the last position without a comma. We then prepend every prior (optional or required) property with a comma.

Copy link
Member

@rlouf rlouf Jan 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not do the same as below, actually? This may simplify the code by considering both cases at once?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not changing the order of the fields.
An issue with "When at least one property is required, we select the last in the list and include it at the last position without a comma" is that it would change the order of the fields if the last property is an optional one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't see a way of easily combining both: the case in which there are no required properties creates as many subregexes as there are properties and connects them with `|`, while the case with at least one required property does not need that.

# without any comma.
# For each property before it (optional or required), we add a comma after the property.
# For each property after it (optional), we add a comma before the property.
if any(is_required):
last_required_pos = max([i for i, value in enumerate(is_required) if value])
for i, (name, value) in enumerate(properties.items()):
subregex = f'{whitespace}"{name}"{whitespace}:{whitespace}'
subregex += to_regex(resolver, value)
if i < last_required_pos:
subregex = f"{subregex}{whitespace},"
elif i > last_required_pos:
subregex = f"{whitespace},{subregex}"
regex += subregex if is_required[i] else f"({subregex})?"
# If no property is required, we have to create a possible pattern for each property in which
# it's the last one necessarily present. Then, we add the others as optional before and after
# following the same strategy as described above.
# The whole block is made optional to allow the case in which no property is returned.
else:
property_subregexes = []
for i, (name, value) in enumerate(properties.items()):
subregex = f'{whitespace}"{name}"{whitespace}:{whitespace}'
subregex += to_regex(resolver, value)
property_subregexes.append(subregex)
possible_patterns = []
for i in range(len(property_subregexes)):
pattern = ""
for subregex in property_subregexes[:i]:
pattern += f"({subregex}{whitespace},)?"
pattern += property_subregexes[i]
for subregex in property_subregexes[i + 1 :]:
pattern += f"({whitespace},{subregex})?"
possible_patterns.append(pattern)
regex += f"({'|'.join(possible_patterns)})?"

regex += f"{whitespace}" + r"\}"

Expand Down
98 changes: 96 additions & 2 deletions tests/fsm/test_json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def test_match_number(pattern, does_match):
"title": "Foo",
"type": "object",
"properties": {"count": {"title": "Count", "type": "integer"}},
"required": ["count"],
},
'\\{[\\n ]*"count"[\\n ]*:[\\n ]*(0|[1-9][0-9]*)[\\n ]*\\}',
[('{\n "count": 100\n}', True)],
Expand Down Expand Up @@ -262,8 +263,10 @@ def test_match_number(pattern, does_match):
"title": "Foo",
"type": "object",
"properties": {"spam": {"title": "Spam", "type": "integer"}},
"required": ["spam"],
}
},
"required": ["fuzz"],
},
f'\\{{[\\n ]*"fuzz"[\\n ]*:[\\n ]*\\{{[\\n ]*"spam"[\\n ]*:[\\n ]*{INTEGER}[\\n ]*\\}}[\\n ]*\\}}',
[('{\n "fuzz": {\n "spam": 100\n }\n}', True)],
Expand All @@ -278,7 +281,7 @@ def test_match_number(pattern, does_match):
"name": {"title": "Name", "type": "string"},
"a": {"$ref": "#/properties/name"},
},
"required": ["user_id", "name"],
"required": ["user_id", "name", "a"],
},
f'\\{{[\\n ]*"user_id"[\\n ]*:[\\n ]*{INTEGER}[\\n ]*,[\\n ]*"name"[\\n ]*:[\\n ]*{STRING}[\\n ]*,[\\n ]*"a"[\\n ]*:[\\n ]*{STRING}[\\n ]*\\}}',
[('{"user_id": 100, "name": "John", "a": "Marc"}', True)],
Expand All @@ -293,7 +296,7 @@ def test_match_number(pattern, does_match):
"name": {"title": "Name", "type": "string"},
"name2": {"$ref": "#/$defs/name"},
},
"required": ["user_id", "name"],
"required": ["user_id", "name", "name2"],
},
f'\\{{[\\n ]*"user_id"[\\n ]*:[\\n ]*{INTEGER}[\\n ]*,[\\n ]*"name"[\\n ]*:[\\n ]*{STRING}[\\n ]*,[\\n ]*"name2"[\\n ]*:[\\n ]*{STRING}[\\n ]*\\}}',
[('{"user_id": 100, "name": "John", "name2": "Marc"}', True)],
Expand All @@ -310,8 +313,10 @@ def test_match_number(pattern, does_match):
"address": {"$ref": "customer#/$defs/address"},
},
"required": [
"name",
"first_name",
"last_name",
"address",
"shipping_address",
"billing_address",
],
Expand Down Expand Up @@ -343,6 +348,95 @@ def test_match_number(pattern, does_match):
)
],
),
# Optional properties
# Last required property in first position
(
{
"properties": {
"name": {"type": "string"},
"age": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
"weapon": {"anyOf": [{"type": "string"}, {"type": "null"}]},
},
"required": ["name"],
"title": "Character",
"type": "object",
},
f'\\{{[\\n ]*"name"[\\n ]*:[\\n ]*{STRING}([\\n ]*,[\\n ]*"age"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER})))?([\\n ]*,[\\n ]*"weapon"[\\n ]*:[\\n ]*(({STRING})|(null)|({STRING}null)|(null{STRING})))?[\\n ]*\\}}',
[
('{ "name" : "Player" }', True),
('{ "name" : "Player", "weapon" : "sword" }', True),
('{ "age" : 10, "weapon" : "sword" }', False),
],
),
# Last required property in middle position
(
{
"properties": {
"name": {"type": "string"},
"age": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
"weapon": {"type": "string"},
"strength": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
},
"required": ["name", "weapon"],
"title": "Character",
"type": "object",
},
f'\\{{[\\n ]*"name"[\\n ]*:[\\n ]*{STRING}[\\n ]*,([\\n ]*"age"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER}))[\\n ]*,)?[\\n ]*"weapon"[\\n ]*:[\\n ]*{STRING}([\\n ]*,[\\n ]*"strength"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER})))?[\\n ]*\\}}',
[
('{ "name" : "Player" , "weapon" : "sword" }', True),
(
'{ "name" : "Player", "age" : 10, "weapon" : "sword" , "strength" : 10 }',
True,
),
('{ "weapon" : "sword" }', False),
],
),
# Last required property in last position
(
{
"properties": {
"name": {"anyOf": [{"type": "string"}, {"type": "null"}]},
"age": {"type": "integer"},
"armor": {"type": "string"},
"strength": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
"weapon": {"title": "Weapon", "type": "string"},
},
"required": ["age", "armor", "weapon"],
"title": "Character",
"type": "object",
},
f'\\{{([\\n ]*"name"[\\n ]*:[\\n ]*(({STRING})|(null)|({STRING}null)|(null{STRING}))[\\n ]*,)?[\\n ]*"age"[\\n ]*:[\\n ]*{INTEGER}[\\n ]*,[\\n ]*"armor"[\\n ]*:[\\n ]*{STRING}[\\n ]*,([\\n ]*"strength"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER}))[\\n ]*,)?[\\n ]*"weapon"[\\n ]*:[\\n ]*{STRING}[\\n ]*\\}}',
[
(
'{ "name" : "Player", "age" : 10, "armor" : "plate", "strength" : 11, "weapon" : "sword" }',
True,
),
('{ "age" : 10, "armor" : "plate", "weapon" : "sword" }', True),
(
'{ "name" : "Kahlhanbeh", "armor" : "plate", "weapon" : "sword" }',
False,
),
],
),
# All properties are optional
(
{
"properties": {
"name": {"anyOf": [{"type": "string"}, {"type": "null"}]},
"age": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
"strength": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
},
"title": "Character",
"type": "object",
},
f'\\{{([\\n ]*"name"[\\n ]*:[\\n ]*(({STRING})|(null)|({STRING}null)|(null{STRING}))([\\n ]*,[\\n ]*"age"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER})))?([\\n ]*,[\\n ]*"strength"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER})))?|([\\n ]*"name"[\\n ]*:[\\n ]*(({STRING})|(null)|({STRING}null)|(null{STRING}))[\\n ]*,)?[\\n ]*"age"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER}))([\\n ]*,[\\n ]*"strength"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER})))?|([\\n ]*"name"[\\n ]*:[\\n ]*(({STRING})|(null)|({STRING}null)|(null{STRING}))[\\n ]*,)?([\\n ]*"age"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER}))[\\n ]*,)?[\\n ]*"strength"[\\n ]*:[\\n ]*(({INTEGER})|(null)|({INTEGER}null)|(null{INTEGER})))?[\\n ]*\\}}',
[
('{ "name" : "Player" }', True),
('{ "name" : "Player", "age" : 10, "strength" : 10 }', True),
('{ "age" : 10, "strength" : 10 }', True),
("{ }", True),
],
),
],
)
def test_match(schema, regex, examples):
Expand Down
3 changes: 2 additions & 1 deletion tests/generate/test_integration_transfomers.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,8 @@ def test_transformers_json_schema():
"properties": {
"foo" : {"type": "integer"},
"bar": {"type": "string", "maxLength": 4}
}
},
"required": ["foo", "bar"]
}
"""

Expand Down