parser: implement string escapes
ncw committed Jun 4, 2015
1 parent 6125042 commit 60d12b3
Showing 6 changed files with 312 additions and 18 deletions.
11 changes: 9 additions & 2 deletions notes.txt
@@ -11,8 +11,6 @@ Limitations

FIXME parsing 00007 works fine whereas it should be throwing an error

FIXME "\tstring\n" isn't lexing properly :-(

FIXME interesting crash with compiling `int("42"), sausage=11)`
Compile error: SystemError: [interface conversion: *ast.Call is not ast.SetCtxer: missing method SetCtx]

@@ -88,6 +86,15 @@ Arg struct {

And pass args using []Arg instead of StringDict

Compiler
========

Complete but without optimisation.

Easy wins
* Constant folding, eg -1 is LOAD_CONST 1; NEGATE
* Jump optimisation - compiler emits lots of jumps to jumps

Testing
=======

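The constant-folding item under "Easy wins" above could be done with a small peephole pass over the emitted code, roughly as sketched here. The Instr type and opcode names are hypothetical and only illustrate collapsing a LOAD_CONST followed by a unary negate into a single LOAD_CONST; the real compiler's instruction representation may differ.

package main

import "fmt"

// Instr is a hypothetical instruction form used only for this sketch.
type Instr struct {
	Op  string      // e.g. "LOAD_CONST", "UNARY_NEGATIVE"
	Arg interface{} // constant value for LOAD_CONST
}

// foldConstants rewrites LOAD_CONST n; UNARY_NEGATIVE into LOAD_CONST -n
// for integer constants and leaves everything else untouched.
func foldConstants(code []Instr) []Instr {
	out := make([]Instr, 0, len(code))
	for i := 0; i < len(code); i++ {
		if code[i].Op == "LOAD_CONST" && i+1 < len(code) && code[i+1].Op == "UNARY_NEGATIVE" {
			if n, ok := code[i].Arg.(int); ok {
				out = append(out, Instr{"LOAD_CONST", -n})
				i++ // skip the now-redundant negate
				continue
			}
		}
		out = append(out, code[i])
	}
	return out
}

func main() {
	code := []Instr{{"LOAD_CONST", 1}, {"UNARY_NEGATIVE", nil}, {"RETURN_VALUE", nil}}
	fmt.Println(foldConstants(code)) // [{LOAD_CONST -1} {RETURN_VALUE <nil>}]
}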
7 changes: 6 additions & 1 deletion parser/lexer.go
@@ -818,7 +818,12 @@ found:
}
foundEndOfString:
if !rawString {
// FIXME expand / sequences
var err error
buf, err = DecodeEscape(buf, byteString)
if err != nil {
x.Errorf("Decode error: %v", err)
return eofError, nil
}
}
if byteString {
return STRING, py.Bytes(buf.Bytes())
28 changes: 14 additions & 14 deletions parser/lexer_test.go
@@ -546,49 +546,49 @@ func TestLexerReadString(t *testing.T) {
{``, eof, nil, ``},
{`1`, eof, nil, `1`},

{`""a`, STRING, py.String(``), `a`},
{`u"abc"`, STRING, py.String(`abc`), ``},
{`"a\nc"`, STRING, py.String(`a\nc`), ``},
{`""a`, STRING, py.String(""), `a`},
{`u"abc"`, STRING, py.String("abc"), ``},
{`"a\nc"`, STRING, py.String("a\nc"), ``},
{`r"a\nc"`, STRING, py.String(`a\nc`), ``},
{`"a\"c"`, STRING, py.String(`a\"c`), ``},
{`"a\\"+`, STRING, py.String(`a\\`), `+`},
{`"a`, eofError, nil, `a`},
{`"a\"c"`, STRING, py.String("a\"c"), ``},
{`"a\\"+`, STRING, py.String("a\\"), `+`},
{`"a`, eofError, nil, "a"},
{"\"a\n", eofError, nil, "a\n"},
{"\"a\\\nb\"c", STRING, py.String(`ab`), `c`},

{`''a`, STRING, py.String(``), `a`},
{`U'abc'`, STRING, py.String(`abc`), ``},
{`'a\nc'`, STRING, py.String(`a\nc`), ``},
{`'a\nc'`, STRING, py.String("a\nc"), ``},
{`R'a\nc'`, STRING, py.String(`a\nc`), ``},
{`'a\'c'`, STRING, py.String(`a\'c`), ``},
{`'a\'c'`, STRING, py.String("a'c"), ``},
{`'\n`, eofError, nil, `\n`},
{`'a`, eofError, nil, `a`},
{"'\\\n\\\npotato\\\nX\\\n'c", STRING, py.String(`potatoX`), `c`},

{`""""""a`, STRING, py.String(``), `a`},
{`u"""abc"""`, STRING, py.String(`abc`), ``},
{`"""a\nc"""`, STRING, py.String(`a\nc`), ``},
{`"""a\nc"""`, STRING, py.String("a\nc"), ``},
{`r"""a\"""c"""`, STRING, py.String(`a\"""c`), ``},
{`"""a\"""c"""`, STRING, py.String(`a\"""c`), ``},
{`"""a\"""c"""`, STRING, py.String(`a"""c`), ``},
{`"""a`, eofError, nil, `a`},
{"\"\"\"a\nb\nc\n\"\"\"\n", STRING, py.String("a\nb\nc\n"), "\n"},
{"\"\"\"a\nb\nc\na", eofError, nil, "a"},
{"\"\"\"a\\\nb\"\"\"c", STRING, py.String(`ab`), `c`},

{`''''''a`, STRING, py.String(``), `a`},
{`U'''abc'''`, STRING, py.String(`abc`), ``},
{`'''a\nc'''`, STRING, py.String(`a\nc`), ``},
{`'''a\nc'''`, STRING, py.String("a\nc"), ``},
{`R'''a\nc'''`, STRING, py.String(`a\nc`), ``},
{`'''a\'''c'''`, STRING, py.String(`a\'''c`), ``},
{`'''a\'''c'''`, STRING, py.String(`a'''c`), ``},
{`'''a`, eofError, nil, `a`},
{"'''a\nb\nc\n'''\n", STRING, py.String("a\nb\nc\n"), "\n"},
{"'''a\nb\nc\na", eofError, nil, "a"},
{"'''\\\na\\\nb\\\n'''c", STRING, py.String(`ab`), `c`},

{`b""a`, STRING, py.Bytes{}, "a"},
{`b'abc'`, STRING, py.Bytes(string(`abc`)), ``},
{`B"""a\nc"""`, STRING, py.Bytes(string(`a\nc`)), ``},
{`B'''a\"c'''`, STRING, py.Bytes(string(`a\"c`)), ``},
{`B"""a\nc"""`, STRING, py.Bytes(string("a\nc")), ``},
{`B'''a\"c'''`, STRING, py.Bytes(string(`a"c`)), ``},

{`rb""a`, STRING, py.Bytes{}, "a"},
{`bR'abc'`, STRING, py.Bytes(string(`abc`)), ``},
144 changes: 144 additions & 0 deletions parser/stringescape.go
@@ -0,0 +1,144 @@
package parser

import (
"bytes"
"strconv"

"github.com/ncw/gpython/py"
)

// DecodeEscape unescapes a backslash-escaped buffer
//
// byteMode indicates whether we are creating a unicode string or a bytes output
func DecodeEscape(in *bytes.Buffer, byteMode bool) (out *bytes.Buffer, err error) {
// Early exit if no escape sequences
// NB in.Bytes() is cheap
inBytes := in.Bytes()
if bytes.IndexRune(inBytes, '\\') < 0 {
return in, nil
}
out = new(bytes.Buffer)
runes := bytes.Runes(inBytes)
decodeHex := func(what byte, i, size int) error {
i++
if i+size <= len(runes) {
cout, err := strconv.ParseInt(string(runes[i:i+size]), 16, 32)
if err != nil {
return py.ExceptionNewf(py.ValueError, "invalid \\%c escape at position %d", what, i-2)
}
if byteMode {
out.WriteByte(byte(cout))
} else {
out.WriteRune(rune(cout))
}
} else {
return py.ExceptionNewf(py.ValueError, "truncated \\%c escape at position %d", what, i-2)
}
return nil
}
ignoreEscape := false
for i := 0; i < len(runes); i++ {
c := runes[i]
if c != '\\' {
out.WriteRune(c)
continue
}
i++
if i >= len(runes) {
return nil, py.ExceptionNewf(py.ValueError, "Trailing \\ in string")
}
c = runes[i]
switch c {
case '\n':
case '\\':
out.WriteRune('\\')
case '\'':
out.WriteRune('\'')
case '"':
out.WriteRune('"')
case 'b':
out.WriteRune('\b')
case 'f':
out.WriteRune('\014') // FF
case 't':
out.WriteRune('\t')
case 'n':
out.WriteRune('\n')
case 'r':
out.WriteRune('\r')
case 'v':
out.WriteRune('\013') // VT
case 'a':
out.WriteRune('\007') // BEL, not classic C
case '0', '1', '2', '3', '4', '5', '6', '7':
// 1 to 3 characters of octal escape
cout := c - '0'
if i+1 < len(runes) && '0' <= runes[i+1] && runes[i+1] <= '7' {
i++
cout = (cout << 3) + runes[i] - '0'
if i+1 < len(runes) && '0' <= runes[i+1] && runes[i+1] <= '7' {
i++
cout = (cout << 3) + runes[i] - '0'
}
}
if byteMode {
out.WriteByte(byte(cout))
} else {
out.WriteRune(cout)
}
case 'x':
// \xhh exactly 2 characters of hex
err = decodeHex('x', i, 2)
if err != nil {
return nil, err
}
i += 2
// FIXME In a bytes literal, hexadecimal and
// octal escapes denote the byte with the
// given value. In a string literal, these
// escapes denote a Unicode character with the
// given value.
case 'u':
// \uxxxx Character with 16-bit hex value xxxx - 4 characters required
if byteMode {
ignoreEscape = true
break
}
err = decodeHex('u', i, 4)
if err != nil {
return nil, err
}
i += 4
case 'U':
// \Uxxxxxxxx Character with 32-bit hex value xxxxxxxx - 8 characters required
if byteMode {
ignoreEscape = true
break
}

err = decodeHex('U', i, 8)
if err != nil {
return nil, err
}
i += 8
case 'N':
// \N{name} Character named name in the Unicode database
if byteMode {
ignoreEscape = true
break
}
// FIXME go can't do this as builtin so ignore for the moment
ignoreEscape = true
default:
ignoreEscape = true
break
}
// ignore unrecognised escape
if ignoreEscape {
i--
out.WriteRune('\\')
ignoreEscape = false
}
}
return out, nil
}
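A quick usage sketch of the new DecodeEscape helper, assuming it is imported from github.com/ncw/gpython/parser (the inputs and printed results are illustrative only):

package main

import (
	"bytes"
	"fmt"

	"github.com/ncw/gpython/parser"
)

func main() {
	// String mode: \t and \x41 are decoded to their character values.
	out, err := parser.DecodeEscape(bytes.NewBufferString(`a\tb\x41`), false)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q\n", out.String()) // "a\tbA"

	// Byte mode: \u escapes pass through unchanged, \xff becomes a raw byte.
	out, err = parser.DecodeEscape(bytes.NewBufferString(`\xff\u1234`), true)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q\n", out.Bytes()) // "\xff\\u1234"
}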
138 changes: 138 additions & 0 deletions parser/stringescape_test.go
@@ -0,0 +1,138 @@
package parser

import (
"bytes"
"testing"

"github.com/ncw/gpython/py"
)

func TestDecodeEscape(t *testing.T) {
for _, test := range []struct {
in string
want string
errString string
byteMode bool
}{
// Stringmode tests
{``, ``, "", false},
{`Potato`, `Potato`, "", false},
{`Potato\`, ``, `Trailing \ in string`, false},
{`\Potato`, `\Potato`, "", false},
{`n\\`, `n\`, "", false},
{`\'x`, `'x`, "", false},
{`\"`, `"`, "", false},
{"\\\n", ``, "", false},
{`\b`, "\010", "", false},
{`\f`, "\014", "", false},
{`\t`, "\011", "", false},
{`\n`, "\012", "", false},
{`\r`, "\015", "", false},
{`\v`, "\013", "", false},
{`\a`, "\007", "", false},
{`\1`, "\001", "", false},
{`\12`, "\012", "", false},
{`\123`, "\123", "", false},
{`\777`, "\u01ff", "", false},
{`\1\12\123\1234`, "\001\012\123\123" + "4", "", false},
{`a\1a\12a\123a`, "a\001a\012a\123a", "", false},
{`\x`, "", `truncated \x escape at position 0`, false},
{`\x1`, "", `truncated \x escape at position 0`, false},
{`\x11`, "\x11", "", false},
{`\xzz`, "", `invalid \x escape at position 0`, false},
{`{\x11}`, "{\x11}", "", false},
{`\x01\x8a\xff`, "\x01\u008a\u00ff", "", false},
{`\x01\x8A\xFF`, "\x01\u008a\u00ff", "", false},
{`\u`, "", `truncated \u escape at position 0`, false},
{`\u1`, "", `truncated \u escape at position 0`, false},
{`\u12`, "", `truncated \u escape at position 0`, false},
{`z\u134`, "", `truncated \u escape at position 1`, false},
{`\u1234`, "\u1234", "", false},
{`z\uzzzz`, "", `invalid \u escape at position 1`, false},
{`{\u1234}`, "{\u1234}", "", false},
{`\U00000001\U0000018a\U000012ff`, "\U00000001\U0000018a\U000012ff", "", false},
{`\U00000001\U0000018A\U000012FF`, "\U00000001\U0000018a\U000012ff", "", false},
{`\U0000`, "", `truncated \U escape at position 0`, false},
{`\U00001`, "", `truncated \U escape at position 0`, false},
{`\U000012`, "", `truncated \U escape at position 0`, false},
{`z\U0000134`, "", `truncated \U escape at position 1`, false},
{`\U00001234`, "\U00001234", "", false},
{`z\Uzzzzzzzz`, "", `invalid \U escape at position 1`, false},
{`{\U00001234}`, "{\U00001234}", "", false},
{`\U00000001\U0000018a\U000012ff`, "\U00000001\U0000018a\U000012ff", "", false},
{`\U00000001\U0000018A\U000012FF`, "\U00000001\U0000018a\U000012ff", "", false},
{`\N{potato}`, `\N{potato}`, "", false},

// Bytemode tests
{``, ``, "", true},
{`Potato`, `Potato`, "", true},
{`Potato\`, ``, `Trailing \ in string`, true},
{`\Potato`, `\Potato`, "", true},
{`n\\`, `n\`, "", true},
{`\'x`, `'x`, "", true},
{`\"`, `"`, "", true},
{"\\\n", ``, "", true},
{`\b`, "\010", "", true},
{`\f`, "\014", "", true},
{`\t`, "\011", "", true},
{`\n`, "\012", "", true},
{`\r`, "\015", "", true},
{`\v`, "\013", "", true},
{`\a`, "\007", "", true},
{`\1`, "\001", "", true},
{`\12`, "\012", "", true},
{`\123`, "\123", "", true},
{`\777`, "\xff", "", true},
{`\1\12\123\1234`, "\001\012\123\123" + "4", "", true},
{`a\1a\12a\123a`, "a\001a\012a\123a", "", true},
{`\x`, "", `truncated \x escape at position 0`, true},
{`\x1`, "", `truncated \x escape at position 0`, true},
{`\x11`, "\x11", "", true},
{`\xzz`, "", `invalid \x escape at position 0`, true},
{`{\x11}`, "{\x11}", "", true},
{`\x01\x8a\xff`, "\x01\x8a\xff", "", true},
{`\x01\x8A\xFF`, "\x01\x8a\xff", "", true},
{`\u`, `\u`, "", true},
{`\u1`, `\u1`, "", true},
{`\u12`, `\u12`, "", true},
{`z\u134`, `z\u134`, "", true},
{`\u1234`, `\u1234`, "", true},
{`z\uzzzz`, `z\uzzzz`, "", true},
{`{\u1234}`, `{\u1234}`, "", true},
{`\U00000001\U0000018a\U000012ff`, `\U00000001\U0000018a\U000012ff`, "", true},
{`\U00000001\U0000018A\U000012FF`, `\U00000001\U0000018A\U000012FF`, "", true},
{`\U0000`, `\U0000`, "", true},
{`\U00001`, `\U00001`, "", true},
{`\U000012`, `\U000012`, "", true},
{`z\U0000134`, `z\U0000134`, "", true},
{`\U00001234`, `\U00001234`, "", true},
{`z\Uzzzzzzzz`, `z\Uzzzzzzzz`, "", true},
{`{\U00001234}`, `{\U00001234}`, "", true},
{`\U00000001\U0000018a\U000012ff`, `\U00000001\U0000018a\U000012ff`, "", true},
{`\U00000001\U0000018A\U000012FF`, `\U00000001\U0000018A\U000012FF`, "", true},
{`\N{potato}`, `\N{potato}`, "", true},
} {
in := bytes.NewBufferString(test.in)
out, err := DecodeEscape(in, test.byteMode)
if err != nil {
if test.errString == "" {
t.Errorf("%q: not expecting error but got: %v", test.in, err)
} else {
exc := err.(*py.Exception)
args := exc.Args.(py.Tuple)
if string(args[0].(py.String)) != test.errString {
t.Errorf("%q: want error %q but got %q", test.in, test.errString, args[0])
}
}
continue
}
if test.errString != "" {
t.Errorf("%q: expecting error but didn't get one", test.in)
continue
}
got := out.String()
if test.want != got {
t.Errorf("%q: want %q but got %q", test.in, test.want, got)
}
}
}
2 changes: 1 addition & 1 deletion py/tests/int.py
@@ -90,7 +90,7 @@ def assertRaises(expecting, s, base=None):
doc="whitespace"
assert int(" +100000", 0) == +tenE5
assert int("+100000 ", 0) == +tenE5
# FIXME broken in lexer? assert int("\t\t\t\t100000\t\t\t\t", 0) == tenE5
assert int("\t\t\t\t100000\t\t\t\t", 0) == tenE5
assert int(" 100000 ", 0) == tenE5

doc="sigils"
