diff --git a/release-notes/CREDITS-2.x b/release-notes/CREDITS-2.x
index 6fe06d1660..746331da55 100644
--- a/release-notes/CREDITS-2.x
+++ b/release-notes/CREDITS-2.x
@@ -435,6 +435,16 @@ Antonin Janec (@xtonic)
  * Contributed #1218: Simplify Unicode surrogate pair conversion for generation
   (2.17.0)
 
+Ian Roberts (@ianroberts)
+ * Reported #223: `UTF8JsonGenerator` writes supplementary characters as a
+  surrogate pair: should use 4-byte encoding
+  (2.18.0)
+
+Radovan Netuka (@rnetuka)
+ * Contributed fix for #223: `UTF8JsonGenerator` writes supplementary characters as a
+  surrogate pair: should use 4-byte encoding
+  (2.18.0)
+
 Jared Stehler (@jaredstehler)
  * Reported, contributed fix for #1274: `NUL`-corrupted keys, values on JSON serialization
   (2.18.0)
diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x
index 0ba6e5336e..eaa678f1da 100644
--- a/release-notes/VERSION-2.x
+++ b/release-notes/VERSION-2.x
@@ -16,6 +16,10 @@ a pure JSON library.
 
 2.18.0 (not yet released)
 
+#223: `UTF8JsonGenerator` writes supplementary characters as a surrogate pair:
+  should use 4-byte encoding
+ (reported by Ian R)
+ (fix contributed by Radovan N)
 #1230: Improve performance of `float` and `double` parsing from `TextBuffer`
  (implemented by @pjfanning)
 #1251: `InternCache` replace synchronized with `ReentrantLock` - the cache
diff --git a/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java
index 91451a0ea0..42ff2a9d82 100644
--- a/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java
+++ b/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java
@@ -269,13 +269,20 @@ public enum Feature {
         WRITE_HEX_UPPER_CASE(true),
 
         /**
-         * Feature that specifies whether {@link JsonGenerator} should escape forward slashes.
-         * <p>
-         * Feature is disabled by default for Jackson 2.x version, and enabled by default in Jackson 3.0.
+         * See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#ESCAPE_FORWARD_SLASHES}.
          *
          * @since 2.17
          */
-        ESCAPE_FORWARD_SLASHES(false);
+        ESCAPE_FORWARD_SLASHES(false),
+
+        /**
+         * See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#COMBINE_UNICODE_SURROGATES_IN_UTF8}.
+         *
+         * @since 2.18
+         */
+        COMBINE_UNICODE_SURROGATES_IN_UTF8(false),
+
+        ;
 
         private final boolean _defaultState;
         private final int _mask;
diff --git a/src/main/java/com/fasterxml/jackson/core/json/JsonWriteFeature.java b/src/main/java/com/fasterxml/jackson/core/json/JsonWriteFeature.java
index f310ad9575..356a95f27e 100644
--- a/src/main/java/com/fasterxml/jackson/core/json/JsonWriteFeature.java
+++ b/src/main/java/com/fasterxml/jackson/core/json/JsonWriteFeature.java
@@ -12,7 +12,7 @@ public enum JsonWriteFeature
 {
     // // // Support for non-standard data format constructs: comments
 
-    // // Quoting/ecsaping-related features
+    // // Quoting/escaping-related features
 
     /**
      * Feature that determines whether JSON Object field names are
@@ -117,6 +117,28 @@ public enum JsonWriteFeature
      */
     ESCAPE_FORWARD_SLASHES(false, JsonGenerator.Feature.ESCAPE_FORWARD_SLASHES),
 
+    /**
+     * Feature that specifies how characters outside "Basic Multilingual Plane" (BMP) -- ones encoded
+     * as 4-byte UTF-8 sequences but represented in JVM memory as 2 16-bit "surrogate" {@code chars} --
+     * should be encoded as UTF-8 by {@link JsonGenerator}.
+     * If enabled, well-formed surrogate pairs (high surrogate followed by low surrogate) are
+     * combined and flushed as a single, 4-byte UTF-8 character.
+     * If disabled, each {@code char} of pair is written separately: that is, as 2
+     * separate 6-byte JSON escape sequences (backslash, {@code u}, 4 hex digits) with values in
+     * Surrogate character ranges ({@code 0xD800} - {@code 0xDBFF} and {@code 0xDC00} - {@code 0xDFFF})
+     *<p>
+     * Note that this feature only has effect for {@link JsonGenerator}s that directly encode
+     * {@code byte}-based output, as UTF-8 (target {@link java.io.OutputStream}, {@code byte[]}
+     * and so on); it will not (can not) change handling of
+     * {@code char}-based output (like {@link java.io.Writer} or {@link java.lang.String}).
+     *<p>
+     * Feature is disabled by default in 2.x for backwards-compatibility (will be enabled
+     * in 3.0).
+     *
+     * @since 2.18
+     */
+    COMBINE_UNICODE_SURROGATES_IN_UTF8(false, JsonGenerator.Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8),
+
     ;
 
     final private boolean _defaultState;
diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
index de47f9b48b..31076e752b 100644
--- a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
+++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
@@ -1510,6 +1510,16 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
                 outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
                 outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
             } else {
+                // 3- or 4-byte character: combine only a well-formed (high, low) surrogate pair
+                if (_isStartOfSurrogatePair(ch)) {
+                    final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
+                    if (combineSurrogates && offset < end && Character.isLowSurrogate(cbuf[offset])) {
+                        char highSurrogate = (char) ch;
+                        char lowSurrogate = cbuf[offset++];
+                        outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
+                        continue;
+                    }
+                }
                 outputPtr = _outputMultiByteChar(ch, outputPtr);
             }
         }
@@ -1548,6 +1558,16 @@ private final void _writeStringSegment2(final String text, int offset, final int
                 outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
                 outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
             } else {
+                // 3- or 4-byte character: combine only a well-formed (high, low) surrogate pair
+                if (_isStartOfSurrogatePair(ch)) {
+                    final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
+                    if (combineSurrogates && offset < end && Character.isLowSurrogate(text.charAt(offset))) {
+                        char highSurrogate = (char) ch;
+                        char lowSurrogate = text.charAt(offset++);
+                        outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
+                        continue;
+                    }
+                }
                 outputPtr = _outputMultiByteChar(ch, outputPtr);
             }
         }
@@ -2133,6 +2153,19 @@ protected final void _outputSurrogates(int surr1, int surr2) throws IOException
         bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
     }
 
+    // @since 2.18
+    private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
+        final int unicode = 0x10000 + ((highSurrogate & 0x03FF) << 10)
+                + (lowSurrogate & 0x03FF);
+
+        _outputBuffer[outputPtr++] = (byte) (0xF0 + ((unicode >> 18) & 0x07));
+        _outputBuffer[outputPtr++] = (byte) (0x80 + ((unicode >> 12) & 0x3F));
+        _outputBuffer[outputPtr++] = (byte) (0x80 + ((unicode >> 6) & 0x3F));
+        _outputBuffer[outputPtr++] = (byte) (0x80 + (unicode & 0x3F));
+
+        return outputPtr;
+    }
+
     /**
      *
      * @param ch
@@ -2214,5 +2247,10 @@ protected final void _flushBuffer() throws IOException
     private byte[] getHexBytes() {
         return _cfgWriteHexUppercase ? HEX_BYTES_UPPER : HEX_BYTES_LOWER;
     }
+
+    // @since 2.18: true only for high (leading) surrogates, 0xD800 - 0xDBFF
+    private boolean _isStartOfSurrogatePair(int ch) {
+        return (ch & 0xFC00) == 0xD800;
+    }
 }
 
diff --git a/src/test/java/com/fasterxml/jackson/failing/Surrogate223Test.java b/src/test/java/com/fasterxml/jackson/core/json/Surrogate223Test.java
similarity index 65%
rename from src/test/java/com/fasterxml/jackson/failing/Surrogate223Test.java
rename to src/test/java/com/fasterxml/jackson/core/json/Surrogate223Test.java
index c1766cf987..38e36adba5 100644
--- a/src/test/java/com/fasterxml/jackson/failing/Surrogate223Test.java
+++ b/src/test/java/com/fasterxml/jackson/core/json/Surrogate223Test.java
@@ -1,4 +1,4 @@
-package com.fasterxml.jackson.failing;
+package com.fasterxml.jackson.core.json;
 
 import java.io.ByteArrayOutputStream;
 import java.io.StringWriter;
@@ -9,10 +9,18 @@
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 
 class Surrogate223Test extends JUnit5TestBase
 {
-    private final JsonFactory JSON_F = new JsonFactory();
+    private final JsonFactory DEFAULT_JSON_F = newStreamFactory();
+
+    // for [core#223]
+    @Test
+    void surrogatesDefaultSetting() throws Exception {
+        // default in 2.x should be disabled:
+        assertFalse(DEFAULT_JSON_F.isEnabled(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.mappedFeature()));
+    }
 
     // for [core#223]
     @Test
@@ -23,11 +31,12 @@ void surrogatesByteBacked() throws Exception
         final String toQuote = new String(Character.toChars(0x1F602));
         assertEquals(2, toQuote.length()); // just sanity check
 
-        // default should be disabled:
-//        assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));
-
         out = new ByteArrayOutputStream();
-        g = JSON_F.createGenerator(out);
+
+        JsonFactory f = JsonFactory.builder()
+                .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
+                .build();
+        g = f.createGenerator(out);
         g.writeStartArray();
         g.writeString(toQuote);
         g.writeEndArray();
@@ -35,16 +44,20 @@ void surrogatesByteBacked() throws Exception
         assertEquals(2 + 2 + 4, out.size()); // brackets, quotes, 4-byte encoding
 
         // Also parse back to ensure correctness
-        JsonParser p = JSON_F.createParser(out.toByteArray());
+        JsonParser p = f.createParser(out.toByteArray());
         assertToken(JsonToken.START_ARRAY, p.nextToken());
         assertToken(JsonToken.VALUE_STRING, p.nextToken());
+        assertEquals(toQuote, p.getText());
         assertToken(JsonToken.END_ARRAY, p.nextToken());
         p.close();
 
         // but may revert back to original behavior
         out = new ByteArrayOutputStream();
-        g = JSON_F.createGenerator(out);
-//        g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
+        f = JsonFactory.builder()
+                .disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
+                .build();
+
+        g = f.createGenerator(out);
         g.writeStartArray();
         g.writeString(toQuote);
         g.writeEndArray();
@@ -52,7 +65,7 @@ void surrogatesByteBacked() throws Exception
         assertEquals(2 + 2 + 12, out.size()); // brackets, quotes, 2 x 6 byte JSON escape
     }
 
-    // for [core#223]
+    // for [core#223]: no change for character-backed (cannot do anything)
     @Test
     void surrogatesCharBacked() throws Exception
     {
@@ -61,11 +74,8 @@ void surrogatesCharBacked() throws Exception
         final String toQuote = new String(Character.toChars(0x1F602));
         assertEquals(2, toQuote.length()); // just sanity check
 
-        // default should be disabled:
-//        assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));
-
         out = new StringWriter();
-        g = JSON_F.createGenerator(out);
+        g = DEFAULT_JSON_F.createGenerator(out);
         g.writeStartArray();
         g.writeString(toQuote);
         g.writeEndArray();
@@ -73,20 +83,35 @@ void surrogatesCharBacked() throws Exception
         assertEquals(2 + 2 + 2, out.toString().length()); // brackets, quotes, 2 chars as is
 
         // Also parse back to ensure correctness
-        JsonParser p = JSON_F.createParser(out.toString());
+        JsonParser p = DEFAULT_JSON_F.createParser(out.toString());
         assertToken(JsonToken.START_ARRAY, p.nextToken());
         assertToken(JsonToken.VALUE_STRING, p.nextToken());
+        assertEquals(toQuote, p.getText());
         assertToken(JsonToken.END_ARRAY, p.nextToken());
         p.close();
-
-        // but may revert back to original behavior
-        out = new StringWriter();
-        g = JSON_F.createGenerator(out);
-//        g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
-        g.writeStartArray();
-        g.writeString(toQuote);
-        g.writeEndArray();
-        g.close();
-        assertEquals(2 + 2 + 12, out.toString().length()); // brackets, quotes, 2 x 6 byte JSON escape
     }
+
+    // for [core#223]: chars in 0xF800 - 0xFFFF are not surrogates, must never be combined
+    @Test
+    void nonSurrogatesNotCombined() throws Exception
+    {
+        final String toQuote = "\uFF21\uFF22"; // 2 fullwidth letters, 3 bytes each in UTF-8
+        final ByteArrayOutputStream out = new ByteArrayOutputStream();
+        final JsonFactory f = JsonFactory.builder()
+                .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
+                .build();
+        try (JsonGenerator g = f.createGenerator(out)) {
+            g.writeStartArray();
+            g.writeString(toQuote);
+            g.writeEndArray();
+        }
+        assertEquals(2 + 2 + 6, out.size()); // brackets, quotes, 2 x 3-byte encoding
+
+        try (JsonParser p = f.createParser(out.toByteArray())) {
+            assertToken(JsonToken.START_ARRAY, p.nextToken());
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            assertEquals(toQuote, p.getText());
+            assertToken(JsonToken.END_ARRAY, p.nextToken());
+        }
+    }
 }