Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emit warnings when dumping binary strings #643

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError, Enco

static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_encode;

static int usascii_encindex, utf8_encindex, binary_encindex;

/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
Expand Down Expand Up @@ -498,7 +500,7 @@ static VALUE mString_to_json_raw_object(VALUE self)
VALUE result = rb_hash_new();
rb_hash_aset(result, rb_funcall(mJSON, i_create_id, 0), rb_class_name(rb_obj_class(self)));
ary = rb_funcall(self, i_unpack, 1, rb_str_new2("C*"));
rb_hash_aset(result, rb_str_new2("raw"), ary);
rb_hash_aset(result, rb_utf8_str_new_lit("raw"), ary);
return result;
}

Expand Down Expand Up @@ -749,8 +751,6 @@ static void generate_json_array(FBuffer *buffer, VALUE Vstate, JSON_Generator_St
fbuffer_append_char(buffer, ']');
}

static int usascii_encindex, utf8_encindex, binary_encindex;

static inline int enc_utf8_compatible_p(int enc_idx)
{
if (enc_idx == usascii_encindex) return 1;
Expand All @@ -764,13 +764,14 @@ static inline VALUE ensure_valid_encoding(VALUE str)
VALUE utf8_string;
if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) {
if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Deprecate in 2.8.0
// TODO: Remove in 3.0.0
utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
return utf8_string;
case ENC_CODERANGE_VALID:
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Raise in 3.0.0
rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
return utf8_string;
break;
}
Expand Down
25 changes: 14 additions & 11 deletions ext/json/ext/parser/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -1795,9 +1795,12 @@ static VALUE convert_encoding(VALUE source)

if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Deprecate in 2.8.0
// TODO: Remove in 3.0.0
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
return utf8_string;
}
}

return rb_str_conv_enc(source, rb_enc_from_index(encindex), rb_utf8_encoding());
Expand Down Expand Up @@ -1946,15 +1949,15 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
}


#line 1950 "parser.c"
#line 1953 "parser.c"
enum {JSON_start = 1};
enum {JSON_first_final = 10};
enum {JSON_error = 0};

enum {JSON_en_main = 1};


#line 858 "parser.rl"
#line 861 "parser.rl"


/*
Expand All @@ -1972,16 +1975,16 @@ static VALUE cParser_parse(VALUE self)
GET_PARSER;


#line 1976 "parser.c"
#line 1979 "parser.c"
{
cs = JSON_start;
}

#line 875 "parser.rl"
#line 878 "parser.rl"
p = json->source;
pe = p + json->len;

#line 1985 "parser.c"
#line 1988 "parser.c"
{
if ( p == pe )
goto _test_eof;
Expand Down Expand Up @@ -2015,7 +2018,7 @@ case 1:
cs = 0;
goto _out;
tr2:
#line 850 "parser.rl"
#line 853 "parser.rl"
{
char *np = JSON_parse_value(json, p, pe, &result, 0);
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
Expand All @@ -2025,7 +2028,7 @@ cs = 0;
if ( ++p == pe )
goto _test_eof10;
case 10:
#line 2029 "parser.c"
#line 2032 "parser.c"
switch( (*p) ) {
case 13: goto st10;
case 32: goto st10;
Expand Down Expand Up @@ -2114,7 +2117,7 @@ case 9:
_out: {}
}

#line 878 "parser.rl"
#line 881 "parser.rl"

if (cs >= JSON_first_final && p == pe) {
return result;
Expand Down
9 changes: 6 additions & 3 deletions ext/json/ext/parser/parser.rl
Original file line number Diff line number Diff line change
Expand Up @@ -690,9 +690,12 @@ static VALUE convert_encoding(VALUE source)

if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Deprecate in 2.8.0
// TODO: Remove in 3.0.0
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
return utf8_string;
}
Comment on lines +693 to +698
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is #138. I'm having second thoughts about deprecating / removing it, as I fear lots of people depend on it, and looking at other parsers they all seem to accept it. So maybe we should just support it in the Java parser?

WDYT @headius @eregon ?

Copy link
Contributor

@headius headius Oct 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops, I commented too early and thought this was a YAML issue.

After reexamining the code, it does appear that the json implementation for JRuby works directly with bytes via the ragel parser. It would probably be possible to make it accept invalid characters.

My position remains the same, however. I'm not sure what other parsers you tested, but all json specifications I looked at explicitly require content to be in one of the main unicode UTF encodings. That says to me that providing invalid characters is off spec and should produce an error.

The original RFC from 2006 even says:

JSON text SHALL be encoded in Unicode. The default encoding is UTF-8.

https://www.ietf.org/rfc/rfc4627.txt

The character stream cannot be parsed as Unicode, then it is not up to spec.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what other parsers you tested,

oj and Python's stdlib.

I totally get your argument, and generally agree, the concern is really about breaking existing (misbehaving) code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I'm crazy, but I would break it in a minor release so it's easy to revert with another minor release, rather than break it in a major release and have to go back. I doubt the impact will be large. If we put out a minor release that starts erroring on bad input and it doesn't get reported for a few months, it's probably fine.

I think it's also important to remember that Ruby has become much more strict about character encodings over the years so the chance of getting bad content has reduced significantly since the original issue.

}

return rb_str_conv_enc(source, rb_enc_from_index(encindex), rb_utf8_encoding());
Expand Down
2 changes: 1 addition & 1 deletion lib/json/add/bigdecimal.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def self.json_create(object)
def as_json(*)
{
JSON.create_id => self.class.name,
'b' => _dump,
'b' => _dump.force_encoding(Encoding::UTF_8),
}
end

Expand Down
9 changes: 7 additions & 2 deletions test/json/json_generator_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,13 @@ def test_valid_utf8_in_different_encoding
wrong_encoding_string = utf8_string.b
# This behavior is historical. Not necessary desirable. We should deprecated it.
# The pure and java version of the gem already don't behave this way.
assert_equal utf8_string.to_json, wrong_encoding_string.to_json
assert_equal JSON.dump(utf8_string), JSON.dump(wrong_encoding_string)
assert_warning(/UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0/) do
assert_equal utf8_string.to_json, wrong_encoding_string.to_json
end

assert_warning(/UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0/) do
assert_equal JSON.dump(utf8_string), JSON.dump(wrong_encoding_string)
end
end

def test_string_ext_included_calls_super
Expand Down
Loading