Use Charset.forName to better cache charset lookups

jhy · Jun 24, 2022 · b873e21 · b873e21
1 parent 38b3224
commit b873e21
Show file tree

Hide file tree

Showing 5 changed files with 11 additions and 14 deletions.
diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -208,7 +208,7 @@ else if (first instanceof Comment) {
             if (doc == null) {
                 if (charsetName == null)
                     charsetName = defaultCharsetName;
-                BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize); // Android level does not allow us try-with-resources
+                BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), bufferSize); // Android level does not allow us try-with-resources
                 try {
                     if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
                         long skipped = reader.skip(1);

diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java
@@ -1163,7 +1163,7 @@ else if (needsMultipart(req)) {
 
         private static void writePost(final Connection.Request req, final OutputStream outputStream, @Nullable final String boundary) throws IOException {
             final Collection<Connection.KeyVal> data = req.data();
-            final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outputStream, req.postDataCharset()));
+            final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName(req.postDataCharset())));
 
             if (boundary != null) {
                 // boundary will be set if we're in multipart mode

diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -6,6 +6,7 @@
 import org.junit.jupiter.api.Test;
 
 import java.io.*;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 
@@ -37,12 +38,7 @@ private InputStream stream(String data) {
     }
 
     private InputStream stream(String data, String charset) {
-        try {
-            return new ByteArrayInputStream(data.getBytes(charset));
-        } catch (UnsupportedEncodingException e) {
-            fail();
-        }
-        return null;
+        return new ByteArrayInputStream(data.getBytes(Charset.forName(charset)));
     }
 
     @Test
@@ -180,7 +176,7 @@ public void supportsUTF8BOM() throws IOException {
 
     @Test
     public void noExtraNULLBytes() throws IOException {
-    	final byte[] b = "<html><head><meta charset=\"UTF-8\"></head><body><div><u>ü</u>ü</div></body></html>".getBytes("UTF-8");
+    	final byte[] b = "<html><head><meta charset=\"UTF-8\"></head><body><div><u>ü</u>ü</div></body></html>".getBytes(StandardCharsets.UTF_8);
 
     	Document doc = Jsoup.parse(new ByteArrayInputStream(b), null, "");
     	assertFalse( doc.outerHtml().contains("\u0000") );
@@ -201,7 +197,7 @@ public void supportsXmlCharsetDeclaration() throws IOException {
                 "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>" +
                         "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" +
                         "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\">Hellö Wörld!</html>"
-        ).getBytes(encoding));
+        ).getBytes(Charset.forName(encoding)));
 
         Document doc = Jsoup.parse(soup, null, "");
         assertEquals("Hellö Wörld!", doc.body().text());

diff --git a/src/test/java/org/jsoup/parser/ParserTest.java b/src/test/java/org/jsoup/parser/ParserTest.java
@@ -6,6 +6,7 @@
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
@@ -31,7 +32,7 @@ public void unescapeEntitiesHandlesLargeInput() {
     @Test
     public void testUtf8() throws IOException {
         // testcase for https://github.com/jhy/jsoup/issues/1557. no repro.
-        Document parsed = Jsoup.parse(new ByteArrayInputStream("<p>H\u00E9llo, w\u00F6rld!".getBytes("UTF-8")), null, "");
+        Document parsed = Jsoup.parse(new ByteArrayInputStream("<p>H\u00E9llo, w\u00F6rld!".getBytes(StandardCharsets.UTF_8)), null, "");
         String text = parsed.selectFirst("p").wholeText();
         assertEquals(text, "H\u00E9llo, w\u00F6rld!");
     }

diff --git a/src/test/java/org/jsoup/parser/TokeniserTest.java b/src/test/java/org/jsoup/parser/TokeniserTest.java
@@ -5,7 +5,7 @@
 import org.jsoup.select.Elements;
 import org.junit.jupiter.api.Test;
 
-import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
 import java.util.Arrays;
 
 import static org.jsoup.parser.CharacterReader.maxBufferLen;
@@ -165,9 +165,9 @@ public void bufferUpInAttributeVal() {
         assertEquals(1, parser.getErrors().size());
     }
 
-    @Test public void cp1252SubstitutionTable() throws UnsupportedEncodingException {
+    @Test public void cp1252SubstitutionTable() {
         for (int i = 0; i < Tokeniser.win1252Extensions.length; i++) {
-            String s = new String(new byte[]{ (byte) (i + Tokeniser.win1252ExtensionsStart) }, "Windows-1252");
+            String s = new String(new byte[]{ (byte) (i + Tokeniser.win1252ExtensionsStart) }, Charset.forName("Windows-1252"));
             assertEquals(1, s.length());
 
             // some of these characters are illegal