Force dates, uuid, datetimes, times to be between quotes #831

Merged (2 commits) on Apr 20, 2024
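In short: the format regexes previously matched the bare value, so guided generation for a string field with a format could emit output like {"date": 2018-11-13}, which is not valid JSON. Wrapping each pattern in literal quotes makes the match a JSON string. A quick illustration with the old and new DATE patterns (both copied from the diff below; this snippet is explanatory, not part of the PR):

```python
import json
import re

OLD_DATE = r"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"
NEW_DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"'

assert re.fullmatch(OLD_DATE, "2018-11-13")        # old: bare, unquoted value
assert re.fullmatch(NEW_DATE, '"2018-11-13"')      # new: a JSON string literal
assert json.loads('"2018-11-13"') == "2018-11-13"  # round-trips as valid JSON
```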
outlines/fsm/guide.py (16 changes: 4 additions & 12 deletions)

```diff
@@ -111,9 +111,7 @@ class RegexGuide(Guide):

     def __init__(self, regex_string: str, tokenizer):
         @cache()
-        def create_states_mapping(
-            regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, int], ...]
-        ) -> Tuple[dict, set, set]:
+        def create_states_mapping(regex_string: str) -> Tuple[dict, set, set]:
             """Create the variables related to the mapping between states and tokens
             The parameters of the function are used for caching purpose
             """
@@ -143,10 +141,7 @@ def create_states_mapping(
             self.states_to_token_maps,
             self.empty_token_ids,
             fsm_finals,
-        ) = create_states_mapping(
-            regex_string, tuple(sorted(tokenizer.vocabulary.items()))
-        )
-        self.vocabulary = list(tokenizer.vocabulary.values())
+        ) = create_states_mapping(regex_string)
         self.eos_token_id = tokenizer.eos_token_id
         self.final_states = fsm_finals | {-1}

@@ -218,7 +213,7 @@ def from_interegular_fsm(
         from_interegular_instance = cls.__new__(cls)

         def create_states_mapping_from_interegular_fsm(
-            fsm: interegular.fsm.FSM, cacheable_vocabulary: Tuple[Tuple[str, int], ...]
+            fsm: interegular.fsm.FSM,
         ) -> Tuple[dict, set]:
             """Create the variables related to the mapping between states and tokens
             The parameters of the function are used for caching purpose
@@ -245,10 +240,7 @@ def create_states_mapping_from_interegular_fsm(
         (
             from_interegular_instance.states_to_token_maps,
             from_interegular_instance.empty_token_ids,
-        ) = create_states_mapping_from_interegular_fsm(
-            interegular_fsm, tuple(sorted(tokenizer.vocabulary.items()))
-        )
-        from_interegular_instance.vocabulary = list(tokenizer.vocabulary.values())
+        ) = create_states_mapping_from_interegular_fsm(interegular_fsm)
         from_interegular_instance.eos_token_id = tokenizer.eos_token_id
         return from_interegular_instance
```
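The guide.py change also simplifies caching: the vocabulary tuple is dropped from the cached functions' signatures, so the states-to-token mapping is cached per regex string alone and the tokenizer is captured from the enclosing scope instead. A minimal analogy using functools.lru_cache (a sketch of the caching idea only; outlines uses its own @cache() decorator, and the function body below is a stand-in, not the real implementation):

```python
from functools import lru_cache


@lru_cache(maxsize=None)
def create_states_mapping(regex_string: str) -> dict:
    # Stand-in for the expensive regex-to-FSM compilation. With the
    # vocabulary removed from the signature, it no longer has to be
    # sorted, tupled, and hashed just to form a cache key.
    print(f"compiling FSM for {regex_string!r}")
    return {}  # placeholder for the real states-to-tokens mapping


create_states_mapping(r"\d+")  # compiles on first use
create_states_mapping(r"\d+")  # cache hit: no recompilation
```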
outlines/fsm/json_schema.py (8 changes: 4 additions & 4 deletions)

```diff
@@ -25,10 +25,10 @@
     "null": NULL,
 }

-DATE_TIME = r"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"
-DATE = r"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"
-TIME = r"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"
-UUID = r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
+DATE_TIME = r'"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"'
+DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"'
+TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"'
+UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"'

 format_to_regex = {
     "uuid": UUID,
```
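The effect of the quoting is easy to verify against the new constants with Python's re module (the assertions mirror the updated test cases; this is an illustration, not part of the PR):

```python
import re

# New quoted pattern, copied verbatim from the diff above.
UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"'

# The quotes are part of the match, so the result is a JSON string literal.
assert re.fullmatch(UUID, '"123e4567-e89b-12d3-a456-426614174000"')
# A bare (unquoted) value no longer matches.
assert re.fullmatch(UUID, "123e4567-e89b-12d3-a456-426614174000") is None
```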
tests/fsm/test_json_schema.py (146 changes: 121 additions & 25 deletions)

```diff
@@ -561,52 +561,56 @@ def test_match(schema, regex, examples):
             {"title": "Foo", "type": "string", "format": "uuid"},
             UUID,
             [
-                ("123e4567-e89b-12d3-a456-426614174000", True),
-                ("123e4567-e89b-12d3-a456-42661417400", False),
-                ("123e4567-e89b-12d3-a456-42661417400g", False),
-                ("123e4567-e89b-12d3-a456-42661417400-", False),
-                ("", False),
+                ("123e4567-e89b-12d3-a456-426614174000", False),
+                ('"123e4567-e89b-12d3-a456-426614174000"', True),
+                ('"123e4567-e89b-12d3-a456-42661417400"', False),
+                ('"123e4567-e89b-12d3-a456-42661417400g"', False),
+                ('"123e4567-e89b-12d3-a456-42661417400-"', False),
+                ('""', False),
             ],
         ),
         # DATE-TIME
         (
             {"title": "Foo", "type": "string", "format": "date-time"},
             DATE_TIME,
             [
-                ("2018-11-13T20:20:39Z", True),
-                ("2016-09-18T17:34:02.666Z", True),
-                ("2008-05-11T15:30:00Z", True),
-                ("2021-01-01T00:00:00", True),
-                ("2022-01-10 07:19:30", False),  # missing T
-                ("2022-12-10T10-04-29", False),  # incorrect separator
-                ("2023-01-01", False),
+                ("2018-11-13T20:20:39Z", False),
+                ('"2018-11-13T20:20:39Z"', True),
+                ('"2016-09-18T17:34:02.666Z"', True),
+                ('"2008-05-11T15:30:00Z"', True),
+                ('"2021-01-01T00:00:00"', True),
+                ('"2022-01-10 07:19:30"', False),  # missing T
+                ('"2022-12-10T10-04-29"', False),  # incorrect separator
+                ('"2023-01-01"', False),
             ],
         ),
         # DATE
         (
             {"title": "Foo", "type": "string", "format": "date"},
             DATE,
             [
-                ("2018-11-13", True),
-                ("2016-09-18", True),
-                ("2008-05-11", True),
-                ("2015-13-01", False),  # incorrect month
-                ("2022-01", False),  # missing day
-                ("2022/12/01", False),  # incorrect separator
+                ("2018-11-13", False),
+                ('"2018-11-13"', True),
+                ('"2016-09-18"', True),
+                ('"2008-05-11"', True),
+                ('"2015-13-01"', False),  # incorrect month
+                ('"2022-01"', False),  # missing day
+                ('"2022/12/01"', False),  # incorrect separator
             ],
         ),
         # TIME
         (
             {"title": "Foo", "type": "string", "format": "time"},
             TIME,
             [
-                ("20:20:39Z", True),
-                ("15:30:00Z", True),
-                ("25:30:00", False),  # incorrect hour
-                ("15:30", False),  # missing seconds
-                ("15:30:00.000", False),  # missing Z
-                ("15-30-00", False),  # incorrect separator
-                ("15:30:00+01:00", False),  # incorrect separator
+                ("20:20:39Z", False),
+                ('"20:20:39Z"', True),
+                ('"15:30:00Z"', True),
+                ('"25:30:00"', False),  # incorrect hour
+                ('"15:30"', False),  # missing seconds
+                ('"15:30:00.000"', False),  # missing Z
+                ('"15-30-00"', False),  # incorrect separator
+                ('"15:30:00+01:00"', False),  # incorrect separator
             ],
         ),
     ],
@@ -625,6 +629,98 @@ def test_format(schema, regex, examples):
         assert match is None


+@pytest.mark.parametrize(
+    "schema,examples",
+    [
+        # NESTED UUID
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"uuid": {"type": "string", "format": "uuid"}},
+            },
+            [
+                ('{"uuid": "123e4567-e89b-12d3-a456-426614174000"}', True),
+                ('{"uuid":"123e4567-e89b-12d3-a456-42661417400"}', False),
+                ('{"uuid":"123e4567-e89b-12d3-a456-42661417400g"}', False),
+                ('{"uuid":"123e4567-e89b-12d3-a456-42661417400-"}', False),
+                (
+                    '{"uuid":123e4567-e89b-12d3-a456-426614174000}',
+                    False,
+                ),  # missing quotes for value
+                ('{"uuid":""}', False),
+            ],
+        ),
+        # NESTED DATE-TIME
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"dateTime": {"type": "string", "format": "date-time"}},
+            },
+            [
+                ('{"dateTime": "2018-11-13T20:20:39Z"}', True),
+                ('{"dateTime":"2016-09-18T17:34:02.666Z"}', True),
+                ('{"dateTime":"2008-05-11T15:30:00Z"}', True),
+                ('{"dateTime":"2021-01-01T00:00:00"}', True),
+                ('{"dateTime":"2022-01-10 07:19:30"}', False),  # missing T
+                ('{"dateTime":"2022-12-10T10-04-29"}', False),  # incorrect separator
+                (
+                    '{"dateTime":2018-11-13T20:20:39Z}',
+                    False,
+                ),  # missing quotes for value
+                ('{"dateTime":"2023-01-01"}', False),
+            ],
+        ),
+        # NESTED DATE
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"date": {"type": "string", "format": "date"}},
+            },
+            [
+                ('{"date": "2018-11-13"}', True),
+                ('{"date":"2016-09-18"}', True),
+                ('{"date":"2008-05-11"}', True),
+                ('{"date":"2015-13-01"}', False),  # incorrect month
+                ('{"date":"2022-01"}', False),  # missing day
+                ('{"date":"2022/12/01"}', False),  # incorrect separator
+                ('{"date":2018-11-13}', False),  # missing quotes for value
+            ],
+        ),
+        # NESTED TIME
+        (
+            {
+                "title": "Foo",
+                "type": "object",
+                "properties": {"time": {"type": "string", "format": "time"}},
+            },
+            [
+                ('{"time": "20:20:39Z"}', True),
+                ('{"time":"15:30:00Z"}', True),
+                ('{"time":"25:30:00"}', False),  # incorrect hour
+                ('{"time":"15:30"}', False),  # missing seconds
+                ('{"time":"15:30:00.000"}', False),  # missing Z
+                ('{"time":"15-30-00"}', False),  # incorrect separator
+                ('{"time":"15:30:00+01:00"}', False),  # incorrect separator
+                ('{"time":20:20:39Z}', False),  # missing quotes for value
+            ],
+        ),
+    ],
+)
+def test_format_without_regex(schema, examples):
+    schema = json.dumps(schema)
+    test_regex = build_regex_from_schema(schema)
+    for string, does_match in examples:
+        match = re.fullmatch(test_regex, string)
+        if does_match:
+            assert match[0] == string
+            assert match.span() == (0, len(string))
+        else:
+            assert match is None
+
+
 @pytest.mark.parametrize("whitespace_pattern", [None, r"[\n ]?", "abc"])
 def test_json_schema_custom_whitespace_pattern(whitespace_pattern):
     """assert whitespace_pattern setting respected"""
```
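Unlike test_format, the new test drives the format fields through the full schema-to-pattern path rather than matching against the bare constants. A condensed usage sketch of the same check (assuming build_regex_from_schema is imported from outlines.fsm.json_schema, as the test module appears to do):

```python
import json
import re

from outlines.fsm.json_schema import build_regex_from_schema  # assumed import path

schema = json.dumps(
    {
        "title": "Foo",
        "type": "object",
        "properties": {"date": {"type": "string", "format": "date"}},
    }
)
pattern = build_regex_from_schema(schema)

assert re.fullmatch(pattern, '{"date": "2018-11-13"}')       # quoted value accepted
assert re.fullmatch(pattern, '{"date":2018-11-13}') is None  # unquoted value rejected
```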