Skip to content

Commit

Permalink
Improved tokenization algorithm
Browse files Browse the repository at this point in the history
This commit brings a BIG update to the tokenization algorithm: instead of
creating "recursive" tokens, the complete tokenization of all of the source
text now happens iteratively in a single loop. The nifty `--debug` command
line option lets you inspect how the input is tokenized.

The new algorithm reduces the complexity of the modes to just a single
`create_token()` method, solves the line numbering problem (#18), and solves
various incorrect tokenization issues (such as #20).

I would like to thank my former professor J.J. Paijmans for teaching me the
knowledge required to pull off this feat. Thanks Paai!

Besides this, a new mode `InsideHTMLTag` was added to allow for different
rules inside HTML tags than outside of them. This means that multiline HTML
tags are now tokenized and parsed correctly.

Finally, the list of Django block tags is gone and they are now recognized
by looking ahead for end tags, which eliminates the need for a configuration
option to register custom template tags (#16).

Closes #16
Closes #18 (again)
Closes #20
  • Loading branch information
JaapJoris committed May 23, 2021
1 parent 1046226 commit 248e90c
Show file tree
Hide file tree
Showing 8 changed files with 309 additions and 336 deletions.
17 changes: 10 additions & 7 deletions djhtml/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,14 @@ def main():

try:
if args.debug:
for line in Mode(source).tokenize(args.tabwidth):
print(repr(line.tokens))
print(Mode(source).debug())
sys.exit()
result = Mode(source).indent(args.tabwidth)
except SyntaxError as e:
if not args.quiet:
print(
f"Error in {input_file.name}: {str(e) or e.__class__.__name__}",
f"Syntax error in {input_file.name}:"
f" {str(e) or e.__class__.__name__}",
file=sys.stderr,
)
exit_status = 1
Expand All @@ -112,10 +112,13 @@ def main():
file=sys.stderr,
)
elif not args.quiet:
print(
f"{input_file.name} is perfectly indented!",
file=sys.stderr,
)
if not args.in_place and args.output_file == "-":
print(result, end="")
else:
print(
f"{input_file.name} is perfectly indented!",
file=sys.stderr,
)

sys.exit(exit_status)

Expand Down
35 changes: 17 additions & 18 deletions djhtml/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,47 @@ class Line:
"""

def __init__(self, tabwidth):
self.tabwidth = tabwidth
def __init__(self, line_nr=1):
self.line_nr = line_nr
self.tokens = []
self.level = 0
self.offset = 0

def append(self, token):
"""
Append tokens to the line.
"""
token.line_nr = self.line_nr
self.tokens.append(token)

@property
def text(self):
"""
The raw, unindented text of this line.
The unindented text of this line without leading/trailing spaces.
"""
return "".join([str(token) for token in self.tokens]).strip()

def __str__(self):
def indent(self, tabwidth):
"""
The final, indented text of this line. Make sure to set the level
and optionally offset before calling ``str()``.
and optionally offset before calling this method.
"""
# If the line consists of a recursive token, return its
# rendered output instead.
if self.tokens and self.tokens[0].recursive:
token = self.tokens[0]
return token.mode(token.text, self.level, token.line_nr).indent(
self.tabwidth
)

if self.text:
spaces = self.tabwidth * self.level + self.offset
return " " * spaces + self.text + "\n"
if self.tokens:
if self.tokens[0].ignore:
return "".join([str(token) for token in self.tokens]) + "\n"
elif self.text:
offset = self.tokens[0].offset * tabwidth
spaces = tabwidth * self.level + offset
return " " * spaces + self.text + "\n"
return "\n"

def __repr__(self):
return repr(self.tokens)

def __bool__(self):
return bool(self.text)
return bool(self.tokens and self.text)

def __next__(self):
return Line(line_nr=self.line_nr + 1)
Loading

0 comments on commit 248e90c

Please sign in to comment.