ruby · jeremyevans · Apr 11, 2022 · Mar 9, 2021 · Apr 7, 2022 · Apr 7, 2022
diff --git a/lib/net/http.rb b/lib/net/http.rb
@@ -690,6 +690,7 @@ def initialize(address, port = nil)
       @continue_timeout = nil
       @max_retries = 1
       @debug_output = nil
+      @response_body_encoding = false
 
       @proxy_from_env = false
       @proxy_uri      = nil
@@ -737,6 +738,18 @@ def set_debug_output(output)
     # The local port used to establish the connection.
     attr_accessor :local_port
 
+    # The encoding to use for the response body.  If Encoding, uses the
+    # specified encoding.  If other true value, tries to detect the response
+    # body encoding.  If false or nil (false is the default), nothing is forced.
+    attr_reader :response_body_encoding
+
+    # Assigns the encoding applied to response bodies.  A String is resolved
+    # through Encoding.find first; any other value is stored untouched.
+    def response_body_encoding=(value)
+      @response_body_encoding = value.is_a?(String) ? Encoding.find(value) : value
+    end
+
     attr_writer :proxy_from_env
     attr_writer :proxy_address
     attr_writer :proxy_port
@@ -1566,6 +1579,7 @@ def transport_request(req)
           begin
             res = HTTPResponse.read_new(@socket)
             res.decode_content = req.decode_content
+            res.body_encoding = @response_body_encoding
           end while res.kind_of?(HTTPInformation)
 
           res.uri = req.uri

diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb
@@ -84,6 +84,7 @@ def initialize(httpv, code, msg)   #:nodoc: internal use only
     @read = false
     @uri  = nil
     @decode_content = false
+    @body_encoding = false
   end
 
   # The HTTP version supported by the server.
@@ -106,6 +107,18 @@ def initialize(httpv, code, msg)   #:nodoc: internal use only
   # Accept-Encoding header from the user.
   attr_accessor :decode_content
 
+  # The encoding to use for the response body. If Encoding, use that encoding.
+  # If other true value, attempt to detect the appropriate encoding, and use
+  # that. If false or nil (false is the default), no encoding is forced.
+  attr_reader :body_encoding
+
+  # Assigns the encoding applied to the response body.  A String is resolved
+  # through Encoding.find first; any other value is stored untouched.
+  def body_encoding=(value)
+    @body_encoding = value.is_a?(String) ? Encoding.find(value) : value
+  end
+
   def inspect
     "#<#{self.class} #{@code} #{@message} readbody=#{@read}>"
   end
@@ -214,6 +227,17 @@ def read_body(dest = nil, &block)
     end
     @read = true
 
+    case enc = @body_encoding
+    when Encoding, false, nil
+      # Encoding: force given encoding
+      # false/nil: do not force encoding
+    else
+      # other value: detect encoding from body
+      enc = detect_encoding(@body)
+    end
+
+    @body.force_encoding(enc) if enc
+
     @body
   end
 
@@ -245,6 +269,141 @@ def body=(value)
 
   private
 
+  # :nodoc:
+  # Works out which encoding the response body +str+ is in.  Sources are
+  # consulted in this order and the first hit wins:
+  #
+  # 1. the +encoding+ argument, when given;
+  # 2. the 'charset' entry of type_params;
+  # 3. a byte order mark found by check_bom;
+  # 4. for XML content types, the encoding named in the XML declaration,
+  #    falling back to UTF-8;
+  # 5. for text/html, whatever sniff_encoding reports.
+  #
+  # Returns an Encoding or an encoding name, or nil when nothing could be
+  # determined (including when there is no content type).
+  def detect_encoding(str, encoding=nil)
+    encoding ||= type_params['charset'] || check_bom(str)
+    return encoding if encoding
+
+    case content_type&.downcase
+    when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
+      # An XML declaration opens with the literal "<?xml", so the "?" has to
+      # be escaped; without it the declared encoding could never be read.
+      /\A<\?xml[ \t\r\n]+
+        version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
+        encoding[ \t\r\n]*=[ \t\r\n]*
+        (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x =~ str
+      # No (usable) declaration: $1 and $2 are nil, so fall back to UTF-8.
+      $1 || $2 || Encoding::UTF_8
+    when %r{text/html.*}
+      sniff_encoding(str)
+    end
+  end
+
+  # :nodoc:
+  # Guesses the encoding of the HTML document +str+.  A charset declared in
+  # a <meta> tag (see scanning_meta) is preferred; otherwise US-ASCII is
+  # reported for pure ASCII input and UTF-8 for input that is valid UTF-8.
+  # Returns nil when none of these apply.  +encoding+ is accepted but unused.
+  def sniff_encoding(str, encoding=nil)
+    # the encoding sniffing algorithm
+    # https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm
+    if enc = scanning_meta(str)
+      enc
+    # 6. last visited page or something
+    # 7. frequency
+    elsif str.ascii_only?
+      Encoding::US_ASCII
+    elsif str.dup.force_encoding(Encoding::UTF_8).valid_encoding?
+      # Checked on a copy so that +str+ keeps its current encoding.
+      Encoding::UTF_8
+    end
+    # 8. implementation-defined or user-specified
+  end
+
+  # :nodoc:
+  # Looks for a Unicode byte order mark at the start of +str+.  Returns
+  # Encoding::UTF_8, Encoding::UTF_16BE or Encoding::UTF_16LE for the
+  # matching mark, or nil when +str+ does not start with one.
+  def check_bom(str)
+    # Compare as binary on both sides: String#== is false for non-ASCII
+    # strings whose encodings differ, so a body read as ASCII-8BIT would
+    # never equal a BOM literal carrying the source file's encoding.
+    head = str.byteslice(0, 3).b
+    return Encoding::UTF_8 if head == "\xEF\xBB\xBF".b
+
+    case head.byteslice(0, 2)
+    when "\xFE\xFF".b then Encoding::UTF_16BE
+    when "\xFF\xFE".b then Encoding::UTF_16LE
+    end
+  end
+
+  # :nodoc:
+  # Prescans the HTML in +str+ for a <meta> tag that declares a character
+  # encoding, following the prescan step of the HTML encoding sniffing
+  # algorithm.  Every <meta> tag is examined in document order until one
+  # yields an encoding that Encoding.find knows.
+  #
+  # A declaration counts when it is a charset attribute, or a content
+  # attribute carrying a charset on a tag that also has
+  # http-equiv="content-type".
+  #
+  # Returns the declared Encoding (UTF-16 variants are reported as UTF-8),
+  # or nil when no usable declaration is found.
+  def scanning_meta(str)
+    require 'strscan'
+    ss = StringScanner.new(str)
+    # "<meta" is only a tag when followed by whitespace or "/"; tag names
+    # are case-insensitive.
+    while ss.scan_until(/<meta(?=[\t\n\f\r \/])/i)
+      seen = {} # attribute names already handled for this tag
+      got_pragma = false
+      need_pragma = nil
+      charset = nil
+
+      # step: Attributes
+      while attr = get_attribute(ss)
+        name, value = attr
+        next if seen[name]
+        seen[name] = true
+        case name
+        when 'http-equiv'
+          got_pragma = true if value == 'content-type'
+        when 'content'
+          # Only relevant while no charset is known and one is actually found.
+          if charset.nil? && (found = extracting_encodings_from_meta_elements(value))
+            charset = found
+            need_pragma = true
+          end
+        when 'charset'
+          need_pragma = false
+          charset = value
+        end
+      end
+
+      # step: Processing
+      next if need_pragma.nil?
+      next if need_pragma && !got_pragma
+
+      encoding = begin
+        Encoding.find(charset)
+      rescue ArgumentError
+        nil # unknown or empty encoding name: try the next <meta>
+      end
+      next unless encoding
+
+      # A UTF-16 label cannot be right: the tag was just matched byte by
+      # byte as ASCII.
+      utf16 = [Encoding::UTF_16, Encoding::UTF_16BE, Encoding::UTF_16LE]
+      return utf16.include?(encoding) ? Encoding::UTF_8 : encoding # tentative
+    end
+    nil
+  end
+
+  # Reads the next attribute of the tag that the StringScanner +ss+ is
+  # positioned in.  Returns [name, value] with both lowercased (value is ''
+  # when the attribute has none).  Returns nil once the tag's closing ">"
+  # has been consumed, or when no attribute name can be read.
+  def get_attribute(ss)
+    ss.skip(/[\t\n\f\r \/]*/)
+    if ss.peek(1) == '>'
+      ss.getch
+      return nil
+    end
+    name = ss.scan(/[^=\t\n\f\r \/>]*/)
+    # Empty name: the input ended or a stray "=" follows.  Stop quietly so a
+    # truncated or sloppy document cannot make reading the body raise.
+    return nil if name.empty?
+    name.downcase!
+    ss.skip(/[\t\n\f\r ]*/)
+    # No "=": attribute without a value.  skip does not advance when it
+    # fails to match, so the tag's ">" is left for the next call.
+    return [name, ''] unless ss.skip(/=/)
+    ss.skip(/[\t\n\f\r ]*/)
+    case ss.peek(1)
+    when '"'
+      ss.getch
+      value = ss.scan(/[^"]*/) # "*" so that an empty value gives '', not nil
+      ss.getch
+    when "'"
+      ss.getch
+      value = ss.scan(/[^']*/)
+      ss.getch
+    when '>'
+      value = ''
+    else
+      value = ss.scan(/[^\t\n\f\r >]*/)
+    end
+    [name, value.downcase]
+  end
+
+  # Pulls the charset out of the value of a <meta content="..."> attribute,
+  # e.g. "text/html; charset=utf-8" gives "utf-8".  The charset may be
+  # unquoted or wrapped in matching single or double quotes.  Returns nil
+  # when no charset is present or its quote is never closed.
+  def extracting_encodings_from_meta_elements(value)
+    # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
+    # Whitespace is permitted on both sides of the "=".
+    if /charset[\t\n\f\r ]*=[\t\n\f\r ]*(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value
+      return $1 || $2 || $3
+    end
+    return nil
+  end
+
   ##
   # Checks for a supported Content-Encoding header and yields an Inflate
   # wrapper for this response's socket when zlib is present.  If the

diff --git a/test/net/http/test_http.rb b/test/net/http/test_http.rb
@@ -1243,3 +1243,57 @@ def test_bind_to_local_port
   end
 end
 
+class TestNetHTTPForceEncoding < Test::Unit::TestCase
+  CONFIG = {
+    'host' => 'localhost',
+    'proxy_host' => nil,
+    'proxy_port' => nil,
+  }
+
+  include TestNetHTTPUtils
+
+  # Mounts /fe on the test server (body "hello\u1234", plus a Content-Type
+  # header when +content_type+ is given) and fetches it through a client
+  # whose response_body_encoding is +force_enc+.  Returns the response.
+  def fe_request(force_enc, content_type=nil)
+    @server.mount_proc('/fe') do |_req, res|
+      res['Content-Type'] = content_type if content_type
+      res.body = "hello\u1234"
+    end
+
+    client = Net::HTTP.new(config('host'), config('port'))
+    client.local_host = Addrinfo.tcp(config('host'), config('port')).ip_address
+    assert_not_nil(client.local_host)
+    assert_nil(client.local_port)
+
+    client.response_body_encoding = force_enc
+    client.get('/fe')
+  end
+
+  # false: the body is left as raw bytes.
+  def test_response_body_encoding_false
+    assert_body("hello\u1234".b, Encoding::ASCII_8BIT, fe_request(false))
+  end
+
+  # true without a Content-Type: nothing to detect, the body stays binary.
+  def test_response_body_encoding_true_without_content_type
+    assert_body("hello\u1234".b, Encoding::ASCII_8BIT, fe_request(true))
+  end
+
+  # true with a charset in the Content-Type: that charset is applied.
+  def test_response_body_encoding_true_with_content_type
+    assert_body("hello\u1234", Encoding::UTF_8, fe_request(true, 'text/html; charset=utf-8'))
+  end
+
+  # An encoding name is applied regardless of the response headers.
+  def test_response_body_encoding_string_without_content_type
+    assert_body("hello\u1234", Encoding::UTF_8, fe_request('utf-8'))
+  end
+
+  # An Encoding object is applied regardless of the response headers.
+  def test_response_body_encoding_encoding_without_content_type
+    assert_body("hello\u1234", Encoding::UTF_8, fe_request(Encoding::UTF_8))
+  end
+
+  private
+
+  # Asserts that the body of +res+ equals +expected+ and carries +encoding+.
+  def assert_body(expected, encoding, res)
+    assert_equal(expected, res.body)
+    assert_equal(encoding, res.body.encoding)
+  end
+end