Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve BOM in web editor (#28935) #28959

Merged
merged 1 commit into from
Jan 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 22 additions & 21 deletions modules/charset/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,21 @@ import (
// UTF8BOM is the utf-8 byte-order marker
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}

// ConvertOpts carries the options accepted by the charset conversion helpers.
type ConvertOpts struct {
// KeepBOM, when true, makes MaybeRemoveBOM return its input untouched so a
// leading UTF-8 BOM survives; the zero value (false) strips the BOM.
KeepBOM bool
}

// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible
func ToUTF8WithFallbackReader(rd io.Reader) io.Reader {
func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader {
buf := make([]byte, 2048)
n, err := util.ReadAtMost(rd, buf)
if err != nil {
return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd)
return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
}

charsetLabel, err := DetectEncoding(buf[:n])
if err != nil || charsetLabel == "UTF-8" {
return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd)
return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
}

encoding, _ := charset.Lookup(charsetLabel)
Expand All @@ -42,20 +46,20 @@ func ToUTF8WithFallbackReader(rd io.Reader) io.Reader {

return transform.NewReader(
io.MultiReader(
bytes.NewReader(RemoveBOMIfPresent(buf[:n])),
bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)),
rd,
),
encoding.NewDecoder(),
)
}

// ToUTF8WithErr converts content to UTF8 encoding
func ToUTF8WithErr(content []byte) (string, error) {
// ToUTF8 converts content to UTF8 encoding
func ToUTF8(content []byte, opts ConvertOpts) (string, error) {
charsetLabel, err := DetectEncoding(content)
if err != nil {
return "", err
} else if charsetLabel == "UTF-8" {
return string(RemoveBOMIfPresent(content)), nil
return string(MaybeRemoveBOM(content, opts)), nil
}

encoding, _ := charset.Lookup(charsetLabel)
Expand All @@ -70,28 +74,22 @@ func ToUTF8WithErr(content []byte) (string, error) {
result = append(result, content[n:]...)
}

result = RemoveBOMIfPresent(result)
result = MaybeRemoveBOM(result, opts)

return string(result), err
}

// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible.
// It wraps content in a bytes.Reader, hands it to ToUTF8WithFallbackReader and
// returns everything that reader yields.
func ToUTF8WithFallback(content []byte) []byte {
bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content)))
func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte {
// The error from io.ReadAll is discarded, so on a read/decode failure the
// caller receives only the bytes read up to that point.
bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content), opts))
return bs
}

// ToUTF8 converts content to UTF-8 encoding and ignores any error reported by
// ToUTF8WithErr, returning whatever string that call produced.
func ToUTF8(content string) string {
res, _ := ToUTF8WithErr([]byte(content))
return res
}

// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
func ToUTF8DropErrors(content []byte) []byte {
func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte {
charsetLabel, err := DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return RemoveBOMIfPresent(content)
return MaybeRemoveBOM(content, opts)
}

encoding, _ := charset.Lookup(charsetLabel)
Expand All @@ -117,11 +115,14 @@ func ToUTF8DropErrors(content []byte) []byte {
}
}

return RemoveBOMIfPresent(decoded)
return MaybeRemoveBOM(decoded, opts)
}

// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
func RemoveBOMIfPresent(content []byte) []byte {
// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
if opts.KeepBOM {
return content
}
if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
return content[3:]
}
Expand Down
133 changes: 34 additions & 99 deletions modules/charset/charset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ func resetDefaultCharsetsOrder() {
}
}

// TestMaybeRemoveBOM checks BOM handling with the zero-value ConvertOpts
// (KeepBOM == false), i.e. the stripping behavior.
// NOTE(review): no case here exercises ConvertOpts{KeepBOM: true}.
func TestRemoveBOMIfPresent(t *testing.T) {
// Input without a BOM must come back unchanged.
res := RemoveBOMIfPresent([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
func TestMaybeRemoveBOM(t *testing.T) {
res := MaybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)

// A leading 0xEF 0xBB 0xBF must be dropped, leaving only the payload bytes.
res = RemoveBOMIfPresent([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
res = MaybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
}

func TestToUTF8WithErr(t *testing.T) {
func TestToUTF8(t *testing.T) {
resetDefaultCharsetsOrder()
var res string
var err error
Expand All @@ -47,84 +47,84 @@ func TestToUTF8WithErr(t *testing.T) {
// locale, so some conversions might behave differently. For that reason, we don't
// depend on particular conversions but on expected behaviors.

res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43})
res, err = ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
assert.NoError(t, err)
assert.Equal(t, "ABC", res)

// "áéíóú"
res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
res, err = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.NoError(t, err)
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))

// "áéíóú"
res, err = ToUTF8WithErr([]byte{
res, err = ToUTF8([]byte{
0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
0xc3, 0xba,
})
}, ConvertOpts{})
assert.NoError(t, err)
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))

res, err = ToUTF8WithErr([]byte{
res, err = ToUTF8([]byte{
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
})
}, ConvertOpts{})
assert.NoError(t, err)
stringMustStartWith(t, "Hola,", res)
stringMustEndWith(t, "AAA.", res)

res, err = ToUTF8WithErr([]byte{
res, err = ToUTF8([]byte{
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
})
}, ConvertOpts{})
assert.NoError(t, err)
stringMustStartWith(t, "Hola,", res)
stringMustEndWith(t, "AAA.", res)

res, err = ToUTF8WithErr([]byte{
res, err = ToUTF8([]byte{
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
})
}, ConvertOpts{})
assert.NoError(t, err)
stringMustStartWith(t, "Hola,", res)
stringMustEndWith(t, "AAA.", res)

// Japanese (Shift-JIS)
// 日属秘ぞしちゅ。
res, err = ToUTF8WithErr([]byte{
res, err = ToUTF8([]byte{
0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
0xBF, 0x82, 0xE3, 0x81, 0x42,
})
}, ConvertOpts{})
assert.NoError(t, err)
assert.Equal(t, []byte{
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
},
[]byte(res))

res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00})
res, err = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
assert.NoError(t, err)
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
}

func TestToUTF8WithFallback(t *testing.T) {
resetDefaultCharsetsOrder()
// "ABC"
res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43})
res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)

// "áéíóú"
res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)

// UTF8 BOM + "áéíóú"
res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)

// "Hola, así cómo ños"
res = ToUTF8WithFallback([]byte{
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73,
})
}, ConvertOpts{})
assert.Equal(t, []byte{
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63,
0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73,
Expand All @@ -133,126 +133,65 @@ func TestToUTF8WithFallback(t *testing.T) {
// "Hola, así cómo "
minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}

res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{})
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
assert.Equal(t, minmatch, res[0:len(minmatch)])

res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{})
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
assert.Equal(t, minmatch, res[0:len(minmatch)])

// Japanese (Shift-JIS)
// "日属秘ぞしちゅ。"
res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{})
assert.Equal(t, []byte{
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
}, res)

res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00})
res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
}

// TestToUTF8 exercises the string-in/string-out ToUTF8 wrapper, which drops
// conversion errors, against ASCII, UTF-8 (with and without BOM), Latin1,
// Shift-JIS and all-zero inputs.
func TestToUTF8(t *testing.T) {
resetDefaultCharsetsOrder()
// Note: the golang compiler seems to behave differently depending on the current
// locale, so some conversions might behave differently. For that reason, we don't
// depend on particular conversions but on expected behaviors.

res := ToUTF8(string([]byte{0x41, 0x42, 0x43}))
assert.Equal(t, "ABC", res)

// "áéíóú"
res = ToUTF8(string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}))
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))

// BOM + "áéíóú"
res = ToUTF8(string([]byte{
0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
0xc3, 0xba,
}))
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))

// Latin1
// Hola, así cómo ños
res = ToUTF8(string([]byte{
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73,
}))
assert.Equal(t, []byte{
0x48, 0x6f, 0x6c, 0x61, 0x2c, 0x20, 0x61, 0x73, 0xc3, 0xad, 0x20, 0x63,
0xc3, 0xb3, 0x6d, 0x6f, 0x20, 0xc3, 0xb1, 0x6f, 0x73,
}, []byte(res))

// Latin1
// Hola, así cómo \x07ños
res = ToUTF8(string([]byte{
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73,
}))
// Only the ASCII prefix "Hola," is pinned for this invalid-byte input.
bytesMustStartWith(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C}, []byte(res))

// This test FAILS
// res = ToUTF8("Hola, así cómo \x81ños")
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
// assert.Regexp(t, "^Hola, así cómo", res)

// Japanese (Shift-JIS)
// 日属秘ぞしちゅ。
res = ToUTF8(string([]byte{
0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
0xBF, 0x82, 0xE3, 0x81, 0x42,
}))
assert.Equal(t, []byte{
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
},
[]byte(res))

// Four NUL bytes must pass through unchanged.
res = ToUTF8("\x00\x00\x00\x00")
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
}

// TestToUTF8DropErrors checks ToUTF8DropErrors against ASCII, UTF-8 (with and
// without BOM), Latin1, Shift-JIS and all-zero inputs. Every call passes the
// zero-value ConvertOpts, so the BOM case expects the BOM to be stripped.
func TestToUTF8DropErrors(t *testing.T) {
resetDefaultCharsetsOrder()
// "ABC"
res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)

// "áéíóú"
res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)

// UTF8 BOM + "áéíóú"
res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)

// "Hola, así cómo ños"
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}, ConvertOpts{})
// Only the first 8 bytes ("Hola, as") and the final byte ("s") are pinned;
// the converted accented characters in between are not asserted.
assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8])
assert.Equal(t, []byte{0x73}, res[len(res)-1:])

// "Hola, así cómo "
minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}

res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{})
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
assert.Equal(t, minmatch, res[0:len(minmatch)])

res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{})
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
assert.Equal(t, minmatch, res[0:len(minmatch)])

// Japanese (Shift-JIS)
// "日属秘ぞしちゅ。"
res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{})
assert.Equal(t, []byte{
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
}, res)

// Four NUL bytes must pass through unchanged.
res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
}

Expand Down Expand Up @@ -302,10 +241,6 @@ func stringMustEndWith(t *testing.T, expected, value string) {
assert.Equal(t, expected, value[len(value)-len(expected):])
}

// bytesMustStartWith asserts that the first len(expected) bytes of value equal expected.
// NOTE(review): value[:len(expected)] panics rather than failing the assertion
// when len(expected) exceeds cap(value).
func bytesMustStartWith(t *testing.T, expected, value []byte) {
assert.Equal(t, expected, value[:len(expected)])
}

func TestToUTF8WithFallbackReader(t *testing.T) {
resetDefaultCharsetsOrder()

Expand All @@ -317,7 +252,7 @@ func TestToUTF8WithFallbackReader(t *testing.T) {
}
input = input[:testLen]
input += "// Выключаем"
rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(input)))
rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(input)), ConvertOpts{})
r, _ := io.ReadAll(rd)
assert.EqualValuesf(t, input, string(r), "testing string len=%d", testLen)
}
Expand Down
2 changes: 1 addition & 1 deletion modules/indexer/code/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
Expand Down
Loading
Loading