Improved tokenization algorithm

This commit brings a BIG update to the tokenization algorithm: instead of creating "recursive" tokens, the complete tokenization of all of the source text now happens iteratively in a single loop. The nifty `--debug` command line option lets you inspect how the input is tokenized. The new algorithm reduces the complexity of the modes to just a single `create_token()` method, solves the line numbering problem (#18), and solves various incorrect tokenization issues (such as #20). I would like to thank my former professor J.J. Paijmans for teaching me the knowledge required to pull off this feat. Thanks Paai! Besides this, a new mode `InsideHTMLTag` was added to allow for different rules within HTML tags than outside of them. This means that multiline HTML tags are now tokenized and parsed correctly. Finally, the list of Django block tags is gone and they are now recognized by looking ahead for end tags, which eliminates the need for a configuration option to register custom template tags (#16). Closes #16 Closes #18 (again) Closes #20
rtts · May 23, 2021 · 248e90c · 248e90c
1 parent 1046226
commit 248e90c
Show file tree

Hide file tree

Showing 8 changed files with 309 additions and 336 deletions.
diff --git a/djhtml/__main__.py b/djhtml/__main__.py
@@ -81,14 +81,14 @@ def main():
 
         try:
             if args.debug:
-                for line in Mode(source).tokenize(args.tabwidth):
-                    print(repr(line.tokens))
+                print(Mode(source).debug())
                 sys.exit()
             result = Mode(source).indent(args.tabwidth)
         except SyntaxError as e:
             if not args.quiet:
                 print(
-                    f"Error in {input_file.name}: {str(e) or e.__class__.__name__}",
+                    f"Syntax error in {input_file.name}:"
+                    f" {str(e) or e.__class__.__name__}",
                     file=sys.stderr,
                 )
             exit_status = 1
@@ -112,10 +112,13 @@ def main():
                     file=sys.stderr,
                 )
         elif not args.quiet:
-            print(
-                f"{input_file.name} is perfectly indented!",
-                file=sys.stderr,
-            )
+            if not args.in_place and args.output_file == "-":
+                print(result, end="")
+            else:
+                print(
+                    f"{input_file.name} is perfectly indented!",
+                    file=sys.stderr,
+                )
 
     sys.exit(exit_status)
 

diff --git a/djhtml/lines.py b/djhtml/lines.py
@@ -4,48 +4,47 @@ class Line:
 
     """
 
-    def __init__(self, tabwidth):
-        self.tabwidth = tabwidth
+    def __init__(self, line_nr=1):
+        self.line_nr = line_nr
         self.tokens = []
         self.level = 0
-        self.offset = 0
 
     def append(self, token):
         """
         Append tokens to the line.
 
         """
+        token.line_nr = self.line_nr
         self.tokens.append(token)
 
     @property
     def text(self):
         """
-        The raw, unindented text of this line.
+        The unindented text of this line without leading/trailing spaces.
 
         """
         return "".join([str(token) for token in self.tokens]).strip()
 
-    def __str__(self):
+    def indent(self, tabwidth):
         """
         The final, indented text of this line. Make sure to set the level
-        and optionally offset before calling ``str()``.
+        and optionally offset before calling this method.
 
         """
-        # If the line consists of a recursive token, return its
-        # rendered output instead.
-        if self.tokens and self.tokens[0].recursive:
-            token = self.tokens[0]
-            return token.mode(token.text, self.level, token.line_nr).indent(
-                self.tabwidth
-            )
-
-        if self.text:
-            spaces = self.tabwidth * self.level + self.offset
-            return " " * spaces + self.text + "\n"
+        if self.tokens:
+            if self.tokens[0].ignore:
+                return "".join([str(token) for token in self.tokens]) + "\n"
+            elif self.text:
+                offset = self.tokens[0].offset * tabwidth
+                spaces = tabwidth * self.level + offset
+                return " " * spaces + self.text + "\n"
         return "\n"
 
     def __repr__(self):
         return repr(self.tokens)
 
     def __bool__(self):
-        return bool(self.text)
+        return bool(self.tokens and self.text)
+
+    def __next__(self):
+        return Line(line_nr=self.line_nr + 1)