Skip to content

Commit

Permalink
Write 4-byte characters (surrogate pairs) instead of escapes (#1335)
Browse files Browse the repository at this point in the history
  • Loading branch information
rnetuka authored Sep 18, 2024
1 parent 5c76113 commit 4d47aae
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 30 deletions.
9 changes: 9 additions & 0 deletions release-notes/CREDITS-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,15 @@ Antonin Janec (@xtonic)
* Contributed #1218: Simplify Unicode surrogate pair conversion for generation
(2.17.0)

Ian Roberts (@ianroberts)
* Reported #223: `UTF8JsonGenerator` writes supplementary characters as a
surrogate pair: should use 4-byte encoding
(2.18.0)

Radovan Netuka (@rnetuka)
* Contributed fix for #223: `UTF8JsonGenerator` writes supplementary characters as a
surrogate pair: should use 4-byte encoding
(2.18.0)

Jared Stehler (@jaredstehler)
* Reported, contributed fix for #1274: `NUL`-corrupted keys, values on JSON serialization
(2.18.0)
Expand Down
4 changes: 4 additions & 0 deletions release-notes/VERSION-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ a pure JSON library.

2.18.0 (not yet released)

#223: `UTF8JsonGenerator` writes supplementary characters as a surrogate pair:
should use 4-byte encoding
(reported by Ian R)
(fix contributed by Radovan N)
#1230: Improve performance of `float` and `double` parsing from `TextBuffer`
(implemented by @pjfanning)
#1251: `InternCache` replace synchronized with `ReentrantLock` - the cache
Expand Down
15 changes: 11 additions & 4 deletions src/main/java/com/fasterxml/jackson/core/JsonGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -269,13 +269,20 @@ public enum Feature {
WRITE_HEX_UPPER_CASE(true),

/**
* Feature that specifies whether {@link JsonGenerator} should escape forward slashes.
* <p>
* Feature is disabled by default for Jackson 2.x version, and enabled by default in Jackson 3.0.
* See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#ESCAPE_FORWARD_SLASHES}.
*
* @since 2.17
*/
ESCAPE_FORWARD_SLASHES(false);
ESCAPE_FORWARD_SLASHES(false),

/**
* See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#COMBINE_UNICODE_SURROGATES_IN_UTF8}.
*
* @since 2.18
*/
COMBINE_UNICODE_SURROGATES_IN_UTF8(false),

;

private final boolean _defaultState;
private final int _mask;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public enum JsonWriteFeature
{
// // // Support for non-standard data format constructs: comments

// // Quoting/ecsaping-related features
// // Quoting/escaping-related features

/**
* Feature that determines whether JSON Object field names are
Expand Down Expand Up @@ -117,6 +117,28 @@ public enum JsonWriteFeature
*/
ESCAPE_FORWARD_SLASHES(false, JsonGenerator.Feature.ESCAPE_FORWARD_SLASHES),

/**
* Feature that specifies how characters outside "Basic Multilingual Plane" (BMP) -- ones encoded
* as 4-byte UTF-8 sequences but represented in JVM memory as 2 16-bit "surrogate" {@code chars} --
* should be encoded as UTF-8 by {@link JsonGenerator}.
* If enabled, surrogate pairs are combined and flushed as a
* single, 4-byte UTF-8 character.
* If disabled, the two {@code char}s of the pair are written as 2 separate characters: that is, as 2
* separate 3-byte UTF-8 characters with values in Surrogate character ranges
* ({@code 0xD800} - {@code 0xDBFF} and {@code 0xDC00} - {@code 0xDFFF}).
* <p>
* Note that this feature only has effect for {@link JsonGenerator}s that directly encode
* {@code byte}-based output, as UTF-8 (target {@link java.io.OutputStream}, {@code byte[]}
* and so on); it will not (can not) change handling of
* {@code char}-based output (like {@link java.io.Writer} or {@link java.lang.String}).
* <p>
* Feature is disabled by default in 2.x for backwards-compatibility (will be enabled
* in 3.0).
*
* @since 2.18
*/
COMBINE_UNICODE_SURROGATES_IN_UTF8(false, JsonGenerator.Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8),

;

final private boolean _defaultState;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1510,6 +1510,16 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
// 3- or 4-byte character
if (_isSurrogateChar(ch)) {
final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
if (combineSurrogates && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = cbuf[offset++];
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
continue;
}
}
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
Expand Down Expand Up @@ -1548,6 +1558,16 @@ private final void _writeStringSegment2(final String text, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
// 3- or 4-byte character
if (_isSurrogateChar(ch)) {
final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
if (combineSurrogates && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = text.charAt(offset++);
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
continue;
}
}
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
Expand Down Expand Up @@ -2133,6 +2153,19 @@ protected final void _outputSurrogates(int surr1, int surr2) throws IOException
bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
}

// @since 2.18
/*
 * Merges the low 10 bits of each surrogate into a single supplementary
 * code point (offset by 0x10000) and stores it as four UTF-8 bytes in
 * `_outputBuffer`, starting at `outputPtr`.
 *
 * Returns the index just past the last byte written. The arguments are
 * only masked, not validated as an actual high/low surrogate pair.
 */
private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
    final int upperBits = (highSurrogate & 0x03FF) << 10;
    final int lowerBits = lowSurrogate & 0x03FF;
    // The two 10-bit halves never overlap, so OR-ing them equals adding them
    final int codePoint = 0x10000 + (upperBits | lowerBits);

    // Lead byte 11110xxx, followed by three 10xxxxxx continuation bytes
    _outputBuffer[outputPtr] = (byte) (0xF0 | ((codePoint >> 18) & 0x07));
    _outputBuffer[outputPtr + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
    _outputBuffer[outputPtr + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
    _outputBuffer[outputPtr + 3] = (byte) (0x80 | (codePoint & 0x3F));

    return outputPtr + 4;
}

/**
*
* @param ch
Expand Down Expand Up @@ -2214,5 +2247,10 @@ protected final void _flushBuffer() throws IOException
// Chooses between the upper-case and lower-case hex byte tables,
// depending on the `_cfgWriteHexUppercase` setting.
private byte[] getHexBytes() {
    if (_cfgWriteHexUppercase) {
        return HEX_BYTES_UPPER;
    }
    return HEX_BYTES_LOWER;
}

// @since 2.18
/*
 * Tells whether `ch` lies in the UTF-16 surrogate range
 * (0xD800 - 0xDFFF, i.e. either half of a surrogate pair).
 *
 * An explicit range check is used on purpose: a bit-mask test such as
 * `(ch & 0xD800) == 0xD800` only verifies that certain bits are set and
 * therefore also accepts non-surrogate characters in 0xF800 - 0xFFFF
 * (for example U+FEFF or U+FFFD).
 *
 * Note that this does not distinguish a high (leading) surrogate from a
 * low (trailing) one.
 */
private boolean _isSurrogateChar(int ch) {
    return ch >= Character.MIN_SURROGATE && ch <= Character.MAX_SURROGATE;
}
}

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.fasterxml.jackson.failing;
package com.fasterxml.jackson.core.json;

import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
Expand All @@ -9,10 +9,18 @@
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;

class Surrogate223Test extends JUnit5TestBase
{
private final JsonFactory JSON_F = new JsonFactory();
private final JsonFactory DEFAULT_JSON_F = newStreamFactory();

// for [core#223]: verifies the out-of-the-box state of the feature
@Test
void surrogatesDefaultSetting() throws Exception {
    // In 2.x, surrogate combining is expected to be off unless explicitly enabled
    final boolean combiningEnabled = DEFAULT_JSON_F.isEnabled(
            JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.mappedFeature());
    assertFalse(combiningEnabled);
}

// for [core#223]
@Test
Expand All @@ -23,36 +31,41 @@ void surrogatesByteBacked() throws Exception
final String toQuote = new String(Character.toChars(0x1F602));
assertEquals(2, toQuote.length()); // just sanity check

// default should be disabled:
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));

out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(out);

JsonFactory f = JsonFactory.builder()
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();
g = f.createGenerator(out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 4, out.size()); // brackets, quotes, 4-byte encoding

// Also parse back to ensure correctness
JsonParser p = JSON_F.createParser(out.toByteArray());
JsonParser p = f.createParser(out.toByteArray());
assertToken(JsonToken.START_ARRAY, p.nextToken());
assertToken(JsonToken.VALUE_STRING, p.nextToken());
assertEquals(toQuote, p.getText());
assertToken(JsonToken.END_ARRAY, p.nextToken());
p.close();

// but may revert back to original behavior
out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
f = JsonFactory.builder()
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();

g = f.createGenerator(out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 12, out.size()); // brackets, quotes, 2 x 6 byte JSON escape
}

// for [core#223]
// for [core#223]: no change for character-backed (cannot do anything)
@Test
void surrogatesCharBacked() throws Exception
{
Expand All @@ -61,32 +74,20 @@ void surrogatesCharBacked() throws Exception
final String toQuote = new String(Character.toChars(0x1F602));
assertEquals(2, toQuote.length()); // just sanity check

// default should be disabled:
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));

out = new StringWriter();
g = JSON_F.createGenerator(out);
g = DEFAULT_JSON_F.createGenerator(out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 2, out.toString().length()); // brackets, quotes, 2 chars as is

// Also parse back to ensure correctness
JsonParser p = JSON_F.createParser(out.toString());
JsonParser p = DEFAULT_JSON_F.createParser(out.toString());
assertToken(JsonToken.START_ARRAY, p.nextToken());
assertToken(JsonToken.VALUE_STRING, p.nextToken());
assertEquals(toQuote, p.getText());
assertToken(JsonToken.END_ARRAY, p.nextToken());
p.close();

// but may revert back to original behavior
out = new StringWriter();
g = JSON_F.createGenerator(out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 12, out.toString().length()); // brackets, quotes, 2 x 6 byte JSON escape
}
}

0 comments on commit 4d47aae

Please sign in to comment.