Force dates, uuid, datetimes, times to be between quotes #831

Merged (2 commits) on Apr 20, 2024
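In short: the format regexes previously matched the bare value, so guided generation for a string field with a format could emit output like {"date": 2018-11-13}, which is not valid JSON. Wrapping each pattern in literal quotes makes the match a JSON string. A quick illustration with the old and new DATE patterns (both copied from the diff below; this snippet is explanatory, not part of the PR):

```python
import json
import re

OLD_DATE = r"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"
NEW_DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"'

assert re.fullmatch(OLD_DATE, "2018-11-13")        # old: bare, unquoted value
assert re.fullmatch(NEW_DATE, '"2018-11-13"')      # new: a JSON string literal
assert json.loads('"2018-11-13"') == "2018-11-13"  # round-trips as valid JSON
```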
outlines/fsm/guide.py (16 changes: 4 additions & 12 deletions)

```diff
@@ -111,9 +111,7 @@ class RegexGuide(Guide):

     def __init__(self, regex_string: str, tokenizer):
         @cache()
-        def create_states_mapping(
-            regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, int], ...]
-        ) -> Tuple[dict, set, set]:
+        def create_states_mapping(regex_string: str) -> Tuple[dict, set, set]:
             """Create the variables related to the mapping between states and tokens
             The parameters of the function are used for caching purpose
             """
@@ -143,10 +141,7 @@ def create_states_mapping(
             self.states_to_token_maps,
             self.empty_token_ids,
             fsm_finals,
-        ) = create_states_mapping(
-            regex_string, tuple(sorted(tokenizer.vocabulary.items()))
-        )
-        self.vocabulary = list(tokenizer.vocabulary.values())
+        ) = create_states_mapping(regex_string)
         self.eos_token_id = tokenizer.eos_token_id
         self.final_states = fsm_finals | {-1}

@@ -218,7 +213,7 @@ def from_interegular_fsm(
         from_interegular_instance = cls.__new__(cls)

         def create_states_mapping_from_interegular_fsm(
-            fsm: interegular.fsm.FSM, cacheable_vocabulary: Tuple[Tuple[str, int], ...]
+            fsm: interegular.fsm.FSM,
         ) -> Tuple[dict, set]:
             """Create the variables related to the mapping between states and tokens
             The parameters of the function are used for caching purpose
@@ -245,10 +240,7 @@ def create_states_mapping_from_interegular_fsm(
         (
             from_interegular_instance.states_to_token_maps,
             from_interegular_instance.empty_token_ids,
-        ) = create_states_mapping_from_interegular_fsm(
-            interegular_fsm, tuple(sorted(tokenizer.vocabulary.items()))
-        )
-        from_interegular_instance.vocabulary = list(tokenizer.vocabulary.values())
+        ) = create_states_mapping_from_interegular_fsm(interegular_fsm)
         from_interegular_instance.eos_token_id = tokenizer.eos_token_id
         return from_interegular_instance
```
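The guide.py change also simplifies caching: the vocabulary tuple is dropped from the cached functions' signatures, so the states-to-token mapping is cached per regex string alone and the tokenizer is captured from the enclosing scope instead. A minimal analogy using functools.lru_cache (a sketch of the caching idea only; outlines uses its own @cache() decorator, and the function body below is a stand-in, not the real implementation):

```python
from functools import lru_cache


@lru_cache(maxsize=None)
def create_states_mapping(regex_string: str) -> dict:
    # Stand-in for the expensive regex-to-FSM compilation. With the
    # vocabulary removed from the signature, it no longer has to be
    # sorted, tupled, and hashed just to form a cache key.
    print(f"compiling FSM for {regex_string!r}")
    return {}  # placeholder for the real states-to-tokens mapping


create_states_mapping(r"\d+")  # compiles on first use
create_states_mapping(r"\d+")  # cache hit: no recompilation
```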
outlines/fsm/json_schema.py (8 changes: 4 additions & 4 deletions)

```diff
@@ -25,10 +25,10 @@
     "null": NULL,
 }

-DATE_TIME = r"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"
-DATE = r"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"
-TIME = r"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"
-UUID = r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
+DATE_TIME = r'"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"'
+DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"'
+TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"'
+UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"'

 format_to_regex = {
     "uuid": UUID,
```
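The effect of the quoting is easy to verify against the new constants with Python's re module (the assertions mirror the updated test cases; this is an illustration, not part of the PR):

```python
import re

# New quoted pattern, copied verbatim from the diff above.
UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"'

# The quotes are part of the match, so the result is a JSON string literal.
assert re.fullmatch(UUID, '"123e4567-e89b-12d3-a456-426614174000"')
# A bare (unquoted) value no longer matches.
assert re.fullmatch(UUID, "123e4567-e89b-12d3-a456-426614174000") is None
```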
tests/fsm/test_json_schema.py (146 changes: 121 additions & 25 deletions)

```diff
@@ -561,52 +561,56 @@ def test_match(schema, regex, examples):
             {"title": "Foo", "type": "string", "format": "uuid"},
             UUID,
             [
-                ("123e4567-e89b-12d3-a456-426614174000", True),
-                ("123e4567-e89b-12d3-a456-42661417400", False),
-                ("123e4567-e89b-12d3-a456-42661417400g", False),
-                ("123e4567-e89b-12d3-a456-42661417400-", False),
-                ("", False),
+                ("123e4567-e89b-12d3-a456-426614174000", False),
+                ('"123e4567-e89b-12d3-a456-426614174000"', True),
+                ('"123e4567-e89b-12d3-a456-42661417400"', False),
+                ('"123e4567-e89b-12d3-a456-42661417400g"', False),
+                ('"123e4567-e89b-12d3-a456-42661417400-"', False),
+                ('""', False),
             ],
         ),
         # DATE-TIME
         (
             {"title": "Foo", "type": "string", "format": "date-time"},
             DATE_TIME,
             [
-                ("2018-11-13T20:20:39Z", True),
-                ("2016-09-18T17:34:02.666Z", True),
-                ("2008-05-11T15:30:00Z", True),
-                ("2021-01-01T00:00:00", True),
-                ("2022-01-10 07:19:30", False),  # missing T
-                ("2022-12-10T10-04-29", False),  # incorrect separator
-                ("2023-01-01", False),
+                ("2018-11-13T20:20:39Z", False),
+                ('"2018-11-13T20:20:39Z"', True),
+                ('"2016-09-18T17:34:02.666Z"', True),
+                ('"2008-05-11T15:30:00Z"', True),
+                ('"2021-01-01T00:00:00"', True),
+                ('"2022-01-10 07:19:30"', False),  # missing T
+                ('"2022-12-10T10-04-29"', False),  # incorrect separator
+                ('"2023-01-01"', False),
             ],
         ),
         # DATE
         (
             {"title": "Foo", "type": "string", "format": "date"},
             DATE,
             [
-                ("2018-11-13", True),
-                ("2016-09-18", True),
-                ("2008-05-11", True),
-                ("2015-13-01", False),  # incorrect month
-                ("2022-01", False),  # missing day
-                ("2022/12/01", False),  # incorrect separator
+                ("2018-11-13", False),
+                ('"2018-11-13"', True),
+                ('"2016-09-18"', True),
+                ('"2008-05-11"', True),
+                ('"2015-13-01"', False),  # incorrect month
+                ('"2022-01"', False),  # missing day
+                ('"2022/12/01"', False),  # incorrect separator
             ],
         ),
         # TIME
         (
             {"title": "Foo", "type": "string", "format": "time"},
             TIME,
             [
-                ("20:20:39Z", True),
-                ("15:30:00Z", True),
-                ("25:30:00", False),  # incorrect hour
-                ("15:30", False),  # missing seconds
-                ("15:30:00.000", False),  # missing Z
-                ("15-30-00", False),  # incorrect separator
-                ("15:30:00+01:00", False),  # incorrect separator
+                ("20:20:39Z", False),
+                ('"20:20:39Z"', True),
+                ('"15:30:00Z"', True),
+                ('"25:30:00"', False),  # incorrect hour
+                ('"15:30"', False),  # missing seconds
+                ('"15:30:00.000"', False),  # missing Z
+                ('"15-30-00"', False),  # incorrect separator
+                ('"15:30:00+01:00"', False),  # incorrect separator
             ],
         ),
     ],
@@ -625,6 +629,98 @@ def test_format(schema, regex, examples):
         assert match is None


+@pytest.mark.parametrize(
+    "schema,examples",
+    [
+        # NESTED UUID
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"uuid": {"type": "string", "format": "uuid"}},
+            },
+            [
+                ('{"uuid": "123e4567-e89b-12d3-a456-426614174000"}', True),
+                ('{"uuid":"123e4567-e89b-12d3-a456-42661417400"}', False),
+                ('{"uuid":"123e4567-e89b-12d3-a456-42661417400g"}', False),
+                ('{"uuid":"123e4567-e89b-12d3-a456-42661417400-"}', False),
+                (
+                    '{"uuid":123e4567-e89b-12d3-a456-426614174000}',
+                    False,
+                ),  # missing quotes for value
+                ('{"uuid":""}', False),
+            ],
+        ),
+        # NESTED DATE-TIME
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"dateTime": {"type": "string", "format": "date-time"}},
+            },
+            [
+                ('{"dateTime": "2018-11-13T20:20:39Z"}', True),
+                ('{"dateTime":"2016-09-18T17:34:02.666Z"}', True),
+                ('{"dateTime":"2008-05-11T15:30:00Z"}', True),
+                ('{"dateTime":"2021-01-01T00:00:00"}', True),
+                ('{"dateTime":"2022-01-10 07:19:30"}', False),  # missing T
+                ('{"dateTime":"2022-12-10T10-04-29"}', False),  # incorrect separator
+                (
+                    '{"dateTime":2018-11-13T20:20:39Z}',
+                    False,
+                ),  # missing quotes for value
+                ('{"dateTime":"2023-01-01"}', False),
+            ],
+        ),
+        # NESTED DATE
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"date": {"type": "string", "format": "date"}},
+            },
+            [
+                ('{"date": "2018-11-13"}', True),
+                ('{"date":"2016-09-18"}', True),
+                ('{"date":"2008-05-11"}', True),
+                ('{"date":"2015-13-01"}', False),  # incorrect month
+                ('{"date":"2022-01"}', False),  # missing day
+                ('{"date":"2022/12/01"}', False),  # incorrect separator
+                ('{"date":2018-11-13}', False),  # missing quotes for value
+            ],
+        ),
+        # NESTED TIME
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"time": {"type": "string", "format": "time"}},
+            },
+            [
+                ('{"time": "20:20:39Z"}', True),
+                ('{"time":"15:30:00Z"}', True),
+                ('{"time":"25:30:00"}', False),  # incorrect hour
+                ('{"time":"15:30"}', False),  # missing seconds
+                ('{"time":"15:30:00.000"}', False),  # missing Z
+                ('{"time":"15-30-00"}', False),  # incorrect separator
+                ('{"time":"15:30:00+01:00"}', False),  # incorrect separator
+                ('{"time":20:20:39Z}', False),  # missing quotes for value
+            ],
+        ),
+    ],
+)
+def test_format_without_regex(schema, examples):
+    schema = json.dumps(schema)
+    test_regex = build_regex_from_schema(schema)
+    for string, does_match in examples:
+        match = re.fullmatch(test_regex, string)
+        if does_match:
+            assert match[0] == string
+            assert match.span() == (0, len(string))
+        else:
+            assert match is None
+
+
 @pytest.mark.parametrize("whitespace_pattern", [None, r"[\n ]?", "abc"])
 def test_json_schema_custom_whitespace_pattern(whitespace_pattern):
     """assert whitespace_pattern setting respected"""
```
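Unlike test_format, the new test drives the format fields through the full schema-to-pattern path rather than matching against the bare constants. A condensed usage sketch of the same check (assuming build_regex_from_schema is imported from outlines.fsm.json_schema, as the test module appears to do):

```python
import json
import re

from outlines.fsm.json_schema import build_regex_from_schema  # assumed import path

schema = json.dumps(
    {
        "title": "Foo",
        "type": "object",
        "properties": {"date": {"type": "string", "format": "date"}},
    }
)
pattern = build_regex_from_schema(schema)

assert re.fullmatch(pattern, '{"date": "2018-11-13"}')       # quoted value accepted
assert re.fullmatch(pattern, '{"date":2018-11-13}') is None  # unquoted value rejected
```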