From 1c403168c6ce0f815316630c4b7e3bfa5d2c11e3 Mon Sep 17 00:00:00 2001 From: SamW Date: Sat, 28 Oct 2023 13:49:45 -0700 Subject: [PATCH] Update Encoding::Converter --- core/encoding.rbs | 1533 +++++++++++++++++++--------------- test/stdlib/Encoding_test.rb | 280 +++++++ 2 files changed, 1154 insertions(+), 659 deletions(-) diff --git a/core/encoding.rbs b/core/encoding.rbs index 8f211474c..84a09fad5 100644 --- a/core/encoding.rbs +++ b/core/encoding.rbs @@ -309,217 +309,428 @@ class Encoding alias to_s name ANSI_X3_4_1968: Encoding + ASCII: Encoding + ASCII_8BIT: Encoding + BIG5: Encoding + BIG5_HKSCS: Encoding + BIG5_HKSCS_2008: Encoding + BIG5_UAO: Encoding + BINARY: Encoding + Big5: Encoding + Big5_HKSCS: Encoding + Big5_HKSCS_2008: Encoding + Big5_UAO: Encoding + CESU_8: Encoding + CP1250: Encoding + CP1251: Encoding + CP1252: Encoding + CP1253: Encoding + CP1254: Encoding + CP1255: Encoding + CP1256: Encoding + CP1257: Encoding + CP1258: Encoding + CP437: Encoding + CP50220: Encoding + CP50221: Encoding + CP51932: Encoding + CP65000: Encoding + CP65001: Encoding + CP737: Encoding + CP775: Encoding + CP850: Encoding + CP852: Encoding + CP855: Encoding + CP857: Encoding + CP860: Encoding + CP861: Encoding + CP862: Encoding + CP863: Encoding + CP864: Encoding + CP865: Encoding + CP866: Encoding + CP869: Encoding + CP874: Encoding + CP878: Encoding + CP932: Encoding + CP936: Encoding + CP949: Encoding + CP950: Encoding + CP951: Encoding + CSWINDOWS31J: Encoding + CsWindows31J: Encoding + EBCDIC_CP_US: Encoding + EMACS_MULE: Encoding + EUCCN: Encoding + EUCJP: Encoding + EUCJP_MS: Encoding + EUCKR: Encoding + EUCTW: Encoding + EUC_CN: Encoding + EUC_JISX0213: Encoding + EUC_JIS_2004: Encoding + EUC_JP: Encoding + EUC_JP_MS: Encoding + EUC_KR: Encoding + EUC_TW: Encoding + Emacs_Mule: Encoding + EucCN: Encoding + EucJP: Encoding + EucJP_ms: Encoding + EucKR: Encoding + EucTW: Encoding + GB12345: Encoding + GB18030: Encoding + GB1988: Encoding + GB2312: Encoding + 
GBK: Encoding + IBM037: Encoding + IBM437: Encoding + IBM737: Encoding + IBM720: Encoding + CP720: Encoding + IBM775: Encoding + IBM850: Encoding + IBM852: Encoding + IBM855: Encoding + IBM857: Encoding + IBM860: Encoding + IBM861: Encoding + IBM862: Encoding + IBM863: Encoding + IBM864: Encoding + IBM865: Encoding + IBM866: Encoding + IBM869: Encoding + ISO2022_JP: Encoding + ISO2022_JP2: Encoding + ISO8859_1: Encoding + ISO8859_10: Encoding + ISO8859_11: Encoding + ISO8859_13: Encoding + ISO8859_14: Encoding + ISO8859_15: Encoding + ISO8859_16: Encoding + ISO8859_2: Encoding + ISO8859_3: Encoding + ISO8859_4: Encoding + ISO8859_5: Encoding + ISO8859_6: Encoding + ISO8859_7: Encoding + ISO8859_8: Encoding + ISO8859_9: Encoding + ISO_2022_JP: Encoding + ISO_2022_JP_2: Encoding + ISO_2022_JP_KDDI: Encoding + ISO_8859_1: Encoding + ISO_8859_10: Encoding + ISO_8859_11: Encoding + ISO_8859_13: Encoding + ISO_8859_14: Encoding + ISO_8859_15: Encoding + ISO_8859_16: Encoding + ISO_8859_2: Encoding + ISO_8859_3: Encoding + ISO_8859_4: Encoding + ISO_8859_5: Encoding + ISO_8859_6: Encoding + ISO_8859_7: Encoding + ISO_8859_8: Encoding + ISO_8859_9: Encoding + KOI8_R: Encoding + KOI8_U: Encoding + MACCENTEURO: Encoding + MACCROATIAN: Encoding + MACCYRILLIC: Encoding + MACGREEK: Encoding + MACICELAND: Encoding + MACJAPAN: Encoding + MACJAPANESE: Encoding + MACROMAN: Encoding + MACROMANIA: Encoding + MACTHAI: Encoding + MACTURKISH: Encoding + MACUKRAINE: Encoding + MacCentEuro: Encoding + MacCroatian: Encoding + MacCyrillic: Encoding + MacGreek: Encoding + MacIceland: Encoding + MacJapan: Encoding + MacJapanese: Encoding + MacRoman: Encoding + MacRomania: Encoding + MacThai: Encoding + MacTurkish: Encoding + MacUkraine: Encoding + PCK: Encoding + SHIFT_JIS: Encoding + SJIS: Encoding + SJIS_DOCOMO: Encoding + SJIS_DoCoMo: Encoding + SJIS_KDDI: Encoding + SJIS_SOFTBANK: Encoding + SJIS_SoftBank: Encoding + STATELESS_ISO_2022_JP: Encoding + STATELESS_ISO_2022_JP_KDDI: Encoding + 
Shift_JIS: Encoding + Stateless_ISO_2022_JP: Encoding + Stateless_ISO_2022_JP_KDDI: Encoding + TIS_620: Encoding + UCS_2BE: Encoding + UCS_4BE: Encoding + UCS_4LE: Encoding + US_ASCII: Encoding + UTF8_DOCOMO: Encoding + UTF8_DoCoMo: Encoding + UTF8_KDDI: Encoding + UTF8_MAC: Encoding + UTF8_SOFTBANK: Encoding + UTF8_SoftBank: Encoding + UTF_16: Encoding + UTF_16BE: Encoding + UTF_16LE: Encoding + UTF_32: Encoding + UTF_32BE: Encoding + UTF_32LE: Encoding + UTF_7: Encoding + UTF_8: Encoding + UTF_8_HFS: Encoding + UTF_8_MAC: Encoding + WINDOWS_1250: Encoding + WINDOWS_1251: Encoding + WINDOWS_1252: Encoding + WINDOWS_1253: Encoding + WINDOWS_1254: Encoding + WINDOWS_1255: Encoding + WINDOWS_1256: Encoding + WINDOWS_1257: Encoding + WINDOWS_1258: Encoding + WINDOWS_31J: Encoding + WINDOWS_874: Encoding + Windows_1250: Encoding + Windows_1251: Encoding + Windows_1252: Encoding + Windows_1253: Encoding + Windows_1254: Encoding + Windows_1255: Encoding + Windows_1256: Encoding + Windows_1257: Encoding + Windows_1258: Encoding + Windows_31J: Encoding + Windows_874: Encoding + # # Raised by Encoding and String methods when the source encoding is incompatible # with the target encoding. @@ -703,691 +914,695 @@ class Encoding # def source_encoding_name: () -> String end -end -# -# Encoding conversion class. -# -class Encoding::Converter < Object - type encoding = String | Encoding - type decorator = "universal_newline" - | "crlf_newline" - | "cr_newline" - | "xml_text_escape" - | "xml_attr_content_escape" - | "xml_attr_quote" - type conversion_path = Array[[encoding, encoding] | decorator] - type convert_result = :invalid_byte_sequence - | :incomplete_input - | :undefined_conversion - | :after_output - | :destination_buffer_full - | :source_buffer_empty - | :finished + # + # Encoding conversion class. 
+ # + class Converter + type decorator = "universal_newline" + | "crlf_newline" + | "cr_newline" + | "lf_newline" + | "xml_text_escape" + | "xml_attr_content_escape" + | "xml_attr_quote" + type conversion_path = Array[[encoding, encoding] | decorator] + type convert_result = :invalid_byte_sequence + | :incomplete_input + | :undefined_conversion + | :after_output + | :destination_buffer_full + | :source_buffer_empty + | :finished + + # + # AFTER_OUTPUT + # + # Stop converting after some output is complete but before all of the input was + # consumed. See primitive_convert for an example. + # + AFTER_OUTPUT: Integer - # - # Returns the corresponding ASCII compatible encoding. - # - # Returns nil if the argument is an ASCII compatible encoding. - # - # "corresponding ASCII compatible encoding" is an ASCII compatible encoding - # which can represents exactly the same characters as the given ASCII - # incompatible encoding. So, no conversion undefined error occurs when - # converting between the two encodings. - # - # Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> # - # Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> # - # Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil - # - def self.asciicompat_encoding: (encoding enc) -> Encoding? + # + # CRLF_NEWLINE_DECORATOR + # + # Decorator for converting LF to CRLF + # + CRLF_NEWLINE_DECORATOR: Integer - # - # Returns a conversion path. 
- # - # p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") - # #=> [[#, #], - # # [#, #]] - # - # p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) - # or - # p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) - # #=> [[#, #], - # # [#, #], - # # "universal_newline"] - # - # p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) - # or - # p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) - # #=> [[#, #], - # # "universal_newline", - # # [#, #]] - # - def self.search_convpath: ( - encoding source, - encoding destination, - ?newline: :universal | :crlf | :cr, - ?universal_newline: bool, - ?crlf_newline: bool, - ?cr_newline: bool, - ?xml: :text | :attr - ) -> conversion_path - - public + # + # CR_NEWLINE_DECORATOR + # + # Decorator for converting LF to CR + # + CR_NEWLINE_DECORATOR: Integer - # - # - def ==: (self) -> bool + # + # INVALID_MASK + # + # Mask for invalid byte sequences + # + INVALID_MASK: Integer - # - # Convert source_string and return destination_string. - # - # source_string is assumed as a part of source. i.e. :partial_input=>true is - # specified internally. finish method should be used last. 
- # - # ec = Encoding::Converter.new("utf-8", "euc-jp") - # puts ec.convert("\u3042").dump #=> "\xA4\xA2" - # puts ec.finish.dump #=> "" - # - # ec = Encoding::Converter.new("euc-jp", "utf-8") - # puts ec.convert("\xA4").dump #=> "" - # puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" - # puts ec.finish.dump #=> "" - # - # ec = Encoding::Converter.new("utf-8", "iso-2022-jp") - # puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") - # puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") - # puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") - # puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP") - # - # If a conversion error occur, Encoding::UndefinedConversionError or - # Encoding::InvalidByteSequenceError is raised. Encoding::Converter#convert - # doesn't supply methods to recover or restart from these exceptions. When you - # want to handle these conversion errors, use - # Encoding::Converter#primitive_convert. - # - def convert: (String source) -> String + # + # INVALID_REPLACE + # + # Replace invalid byte sequences + # + INVALID_REPLACE: Integer - # - # Returns the conversion path of ec. - # - # The result is an array of conversions. - # - # ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) - # p ec.convpath - # #=> [[#, #], - # # [#, #], - # # "crlf_newline"] - # - # Each element of the array is a pair of encodings or a string. A pair means an - # encoding conversion. A string means a decorator. - # - # In the above example, [#, #] means a - # converter from ISO-8859-1 to UTF-8. "crlf_newline" means newline converter - # from LF to CRLF. - # - def convpath: () -> conversion_path + # + # PARTIAL_INPUT + # + # Indicates the source may be part of a larger string. See primitive_convert + # for an example. + # + PARTIAL_INPUT: Integer - # - # Returns the destination encoding as an Encoding object. - # - def destination_encoding: () -> Encoding - - # - # Finishes the converter. 
It returns the last part of the converted string. - # - # ec = Encoding::Converter.new("utf-8", "iso-2022-jp") - # p ec.convert("\u3042") #=> "\e$B$\"" - # p ec.finish #=> "\e(B" - # - def finish: () -> String - - # - # Inserts string into the encoding converter. The string will be converted to - # the destination encoding and output on later conversions. - # - # If the destination encoding is stateful, string is converted according to the - # state and the state is updated. - # - # This method should be used only when a conversion error occurs. - # - # ec = Encoding::Converter.new("utf-8", "iso-8859-1") - # src = "HIRAGANA LETTER A is \u{3042}." - # dst = "" - # p ec.primitive_convert(src, dst) #=> :undefined_conversion - # puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] - # ec.insert_output("") - # p ec.primitive_convert(src, dst) #=> :finished - # puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is .", ""] - # - # ec = Encoding::Converter.new("utf-8", "iso-2022-jp") - # src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp - # dst = "" - # p ec.primitive_convert(src, dst) #=> :undefined_conversion - # puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] - # ec.insert_output "?" # state change required to output "?". - # p ec.primitive_convert(src, dst) #=> :finished - # puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""] - # - def insert_output: (String) -> nil - - # - # Returns a printable version of *ec* - # - # ec = Encoding::Converter.new("iso-8859-1", "utf-8") - # puts ec.inspect #=> # - # - def inspect: () -> String + # + # UNDEF_HEX_CHARREF + # + # Replace byte sequences that are undefined in the destination encoding with an + # XML hexadecimal character reference. This is valid for XML conversion. + # + UNDEF_HEX_CHARREF: Integer - # - # Returns an exception object for the last conversion. 
Returns nil if the last - # conversion did not produce an error. - # - # "error" means that Encoding::InvalidByteSequenceError and - # Encoding::UndefinedConversionError for Encoding::Converter#convert and - # :invalid_byte_sequence, :incomplete_input and :undefined_conversion for - # Encoding::Converter#primitive_convert. - # - # ec = Encoding::Converter.new("utf-8", "iso-8859-1") - # p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence - # p ec.last_error #=> # - # p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full - # p ec.last_error #=> nil - # - def last_error: () -> Encoding::InvalidByteSequenceError? - | () -> Encoding::UndefinedConversionError? + # + # UNDEF_MASK + # + # Mask for a valid character in the source encoding but no related character(s) + # in destination encoding. + # + UNDEF_MASK: Integer - # - # possible opt elements: - # hash form: - # :partial_input => true # source buffer may be part of larger source - # :after_output => true # stop conversion after output before input - # integer form: - # Encoding::Converter::PARTIAL_INPUT - # Encoding::Converter::AFTER_OUTPUT - # - # possible results: - # :invalid_byte_sequence - # :incomplete_input - # :undefined_conversion - # :after_output - # :destination_buffer_full - # :source_buffer_empty - # :finished - # - # primitive_convert converts source_buffer into destination_buffer. - # - # source_buffer should be a string or nil. nil means an empty string. - # - # destination_buffer should be a string. - # - # destination_byteoffset should be an integer or nil. nil means the end of - # destination_buffer. If it is omitted, nil is assumed. - # - # destination_bytesize should be an integer or nil. nil means unlimited. If it - # is omitted, nil is assumed. - # - # opt should be nil, a hash or an integer. nil means no flags. If it is omitted, - # nil is assumed. 
- # - # primitive_convert converts the content of source_buffer from beginning and - # store the result into destination_buffer. - # - # destination_byteoffset and destination_bytesize specify the region which the - # converted result is stored. destination_byteoffset specifies the start - # position in destination_buffer in bytes. If destination_byteoffset is nil, - # destination_buffer.bytesize is used for appending the result. - # destination_bytesize specifies maximum number of bytes. If - # destination_bytesize is nil, destination size is unlimited. After conversion, - # destination_buffer is resized to destination_byteoffset + actually produced - # number of bytes. Also destination_buffer's encoding is set to - # destination_encoding. - # - # primitive_convert drops the converted part of source_buffer. the dropped part - # is converted in destination_buffer or buffered in Encoding::Converter object. - # - # primitive_convert stops conversion when one of following condition met. - # * invalid byte sequence found in source buffer (:invalid_byte_sequence) - # `primitive_errinfo` and `last_error` methods returns the detail of the - # error. - # * unexpected end of source buffer (:incomplete_input) this occur only when - # :partial_input is not specified. `primitive_errinfo` and `last_error` - # methods returns the detail of the error. - # * character not representable in output encoding (:undefined_conversion) - # `primitive_errinfo` and `last_error` methods returns the detail of the - # error. - # * after some output is generated, before input is done (:after_output) this - # occur only when :after_output is specified. - # * destination buffer is full (:destination_buffer_full) this occur only when - # destination_bytesize is non-nil. - # * source buffer is empty (:source_buffer_empty) this occur only when - # :partial_input is specified. 
- # * conversion is finished (:finished) - # - # - # example: - # ec = Encoding::Converter.new("UTF-8", "UTF-16BE") - # ret = ec.primitive_convert(src="pi", dst="", nil, 100) - # p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] - # - # ec = Encoding::Converter.new("UTF-8", "UTF-16BE") - # ret = ec.primitive_convert(src="pi", dst="", nil, 1) - # p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] - # ret = ec.primitive_convert(src, dst="", nil, 1) - # p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] - # ret = ec.primitive_convert(src, dst="", nil, 1) - # p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] - # ret = ec.primitive_convert(src, dst="", nil, 1) - # p [ret, src, dst] #=> [:finished, "", "i"] - # - def primitive_convert: ( - String? source, - String destination, - ?Integer? destination_byteoffset, - ?Integer? destination_bytesize, - ?partial_input: bool, - ?after_output: bool - ) -> convert_result - | ( - String? source, - String destination, - ?Integer? destination_byteoffset, - ?Integer? destination_bytesize, - ?Integer opt - ) -> convert_result + # + # UNDEF_REPLACE + # + # Replace byte sequences that are undefined in the destination encoding. + # + UNDEF_REPLACE: Integer - # - # primitive_errinfo returns important information regarding the last error as a - # 5-element array: - # - # [result, enc1, enc2, error_bytes, readagain_bytes] - # - # result is the last result of primitive_convert. - # - # Other elements are only meaningful when result is :invalid_byte_sequence, - # :incomplete_input or :undefined_conversion. - # - # enc1 and enc2 indicate a conversion step as a pair of strings. For example, a - # converter from EUC-JP to ISO-8859-1 converts a string as follows: EUC-JP -> - # UTF-8 -> ISO-8859-1. So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or - # ["UTF-8", "ISO-8859-1"]. - # - # error_bytes and readagain_bytes indicate the byte sequences which caused the - # error. error_bytes is discarded portion. 
readagain_bytes is buffered portion - # which is read again on next conversion. - # - # Example: - # - # # \xff is invalid as EUC-JP. - # ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") - # ec.primitive_convert(src="\xff", dst="", nil, 10) - # p ec.primitive_errinfo - # #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""] - # - # # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. - # # Since this error is occur in UTF-8 to ISO-8859-1 conversion, - # # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). - # ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - # ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) - # p ec.primitive_errinfo - # #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] - # - # # partial character is invalid - # ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - # ec.primitive_convert(src="\xa4", dst="", nil, 10) - # p ec.primitive_errinfo - # #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] - # - # # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by - # # partial characters. - # ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - # ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) - # p ec.primitive_errinfo - # #=> [:source_buffer_empty, nil, nil, nil, nil] - # - # # \xd8\x00\x00@ is invalid as UTF-16BE because - # # no low surrogate after high surrogate (\xd8\x00). - # # It is detected by 3rd byte (\00) which is part of next character. - # # So the high surrogate (\xd8\x00) is discarded and - # # the 3rd byte is read again later. - # # Since the byte is buffered in ec, it is dropped from src. - # ec = Encoding::Converter.new("UTF-16BE", "UTF-8") - # ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) - # p ec.primitive_errinfo - # #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] - # p src - # #=> "@" - # - # # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. 
- # # The problem is detected by 4th byte. - # ec = Encoding::Converter.new("UTF-16LE", "UTF-8") - # ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) - # p ec.primitive_errinfo - # #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] - # p src - # #=> "" - # - def primitive_errinfo: () -> [convert_result, String?, String?, String?, String?] + # + # UNIVERSAL_NEWLINE_DECORATOR + # + # Decorator for converting CRLF and CR to LF + # + UNIVERSAL_NEWLINE_DECORATOR: Integer - # - # Put back the bytes which will be converted. - # - # The bytes are caused by invalid_byte_sequence error. When - # invalid_byte_sequence error, some bytes are discarded and some bytes are - # buffered to be converted later. The latter bytes can be put back. It can be - # observed by Encoding::InvalidByteSequenceError#readagain_bytes and - # Encoding::Converter#primitive_errinfo. - # - # ec = Encoding::Converter.new("utf-16le", "iso-8859-1") - # src = "\x00\xd8\x61\x00" - # dst = "" - # p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence - # p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] - # p ec.putback #=> "a\x00" - # p ec.putback #=> "" # no more bytes to put back - # - def putback: (?Integer max_numbytes) -> String + # + # XML_ATTR_CONTENT_DECORATOR + # + # Escape as XML AttValue + # + XML_ATTR_CONTENT_DECORATOR: Integer - # - # Returns the replacement string. - # - # ec = Encoding::Converter.new("euc-jp", "us-ascii") - # p ec.replacement #=> "?" - # - # ec = Encoding::Converter.new("euc-jp", "utf-8") - # p ec.replacement #=> "\uFFFD" - # - def replacement: () -> String + # + # XML_ATTR_QUOTE_DECORATOR + # + # Escape as XML AttValue + # + XML_ATTR_QUOTE_DECORATOR: Integer - # - # Sets the replacement string. 
- # - # ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) - # ec.replacement = "" - # p ec.convert("a \u3042 b") #=> "a b" - # - def replacement=: (String str) -> String + # + # XML_TEXT_DECORATOR + # + # Escape as XML CharData + # + XML_TEXT_DECORATOR: Integer + # + # Returns the corresponding ASCII compatible encoding. + # + # Returns nil if the argument is an ASCII compatible encoding. + # + # "corresponding ASCII compatible encoding" is an ASCII compatible encoding + # which can represents exactly the same characters as the given ASCII + # incompatible encoding. So, no conversion undefined error occurs when + # converting between the two encodings. + # + # Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> # + # Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> # + # Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil + # + def self.asciicompat_encoding: (encoding enc) -> Encoding? - # - # Returns the source encoding as an Encoding object. - # - def source_encoding: () -> Encoding + # + # Returns a conversion path. + # + # p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") + # #=> [[#, #], + # # [#, #]] + # + # p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) + # or + # p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) + # #=> [[#, #], + # # [#, #], + # # "universal_newline"] + # + # p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) + # or + # p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) + # #=> [[#, #], + # # "universal_newline", + # # [#, #]] + # + def self.search_convpath: (encoding source, encoding destination, ?int? flags) -> conversion_path + | ( + encoding source, + encoding destination, + ?xml: (:text | :attr)?, + ?newline: (:universal | :crlf | :cr | :lf)? # technically `newline` can be used with the others, but you get warned and stuff is ignored. 
+ ) -> conversion_path + | ( + encoding source, + encoding destination, + ?xml: (:text | :attr)?, + ?universal_newline: boolish, + ?crlf_newline: boolish, + ?cr_newline: boolish, + ?lf_newline: boolish + ) -> conversion_path - private + # + # possible options elements: + # hash form: + # :invalid => nil # raise error on invalid byte sequence (default) + # :invalid => :replace # replace invalid byte sequence + # :undef => nil # raise error on undefined conversion (default) + # :undef => :replace # replace undefined conversion + # :replace => string # replacement string ("?" or "\uFFFD" if not specified) + # :newline => :universal # decorator for converting CRLF and CR to LF + # :newline => :lf # decorator for converting CRLF and CR to LF when writing + # :newline => :crlf # decorator for converting LF to CRLF + # :newline => :cr # decorator for converting LF to CR + # :universal_newline => true # decorator for converting CRLF and CR to LF + # :crlf_newline => true # decorator for converting LF to CRLF + # :cr_newline => true # decorator for converting LF to CR + # :lf_newline => true # decorator for converting CRLF and CR to LF when writing + # :xml => :text # escape as XML CharData. + # :xml => :attr # escape as XML AttValue + # integer form: + # Encoding::Converter::INVALID_REPLACE + # Encoding::Converter::UNDEF_REPLACE + # Encoding::Converter::UNDEF_HEX_CHARREF + # Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR + # Encoding::Converter::LF_NEWLINE_DECORATOR + # Encoding::Converter::CRLF_NEWLINE_DECORATOR + # Encoding::Converter::CR_NEWLINE_DECORATOR + # Encoding::Converter::XML_TEXT_DECORATOR + # Encoding::Converter::XML_ATTR_CONTENT_DECORATOR + # Encoding::Converter::XML_ATTR_QUOTE_DECORATOR + # + # Encoding::Converter.new creates an instance of Encoding::Converter. + # + # Source_encoding and destination_encoding should be a string or Encoding + # object. + # + # opt should be nil, a hash or an integer. + # + # convpath should be an array. 
convpath may contain + # * two-element arrays which contain encodings or encoding names, or + # * strings representing decorator names. + # + # + # Encoding::Converter.new optionally takes an option. The option should be a + # hash or an integer. The option hash can contain :invalid => nil, etc. The + # option integer should be logical-or of constants such as + # Encoding::Converter::INVALID_REPLACE, etc. + # + # :invalid => nil + # : Raise error on invalid byte sequence. This is a default behavior. + # :invalid => :replace + # : Replace invalid byte sequence by replacement string. + # :undef => nil + # : Raise an error if a character in source_encoding is not defined in + # destination_encoding. This is a default behavior. + # :undef => :replace + # : Replace undefined character in destination_encoding with replacement + # string. + # :replace => string + # : Specify the replacement string. If not specified, "uFFFD" is used for + # Unicode encodings and "?" for others. + # :universal_newline => true + # : Convert CRLF and CR to LF. + # :crlf_newline => true + # : Convert LF to CRLF. + # :cr_newline => true + # : Convert LF to CR. + # :lf_newline => true + # : Convert CRLF and CR to LF (when writing). + # :xml => :text + # : Escape as XML CharData. This form can be used as an HTML 4.0 #PCDATA. + # * '&' -> '&amp;' + # * '<' -> '&lt;' + # * '>' -> '&gt;' + # * undefined characters in destination_encoding -> hexadecimal CharRef + # such as &#xHH; + # + # :xml => :attr + # : Escape as XML AttValue. The converted result is quoted as "...". This form + # can be used as an HTML 4.0 attribute value. + # * '&' -> '&amp;' + # * '<' -> '&lt;' + # * '>' -> '&gt;' + # * '"' -> '&quot;' + # * undefined characters in destination_encoding -> hexadecimal CharRef + # such as &#xHH; + # + # + # + # Examples: + # # UTF-16BE to UTF-8 + # ec = Encoding::Converter.new("UTF-16BE", "UTF-8") + # + # # Usually, decorators such as newline conversion are inserted last.
+ # ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) + # p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], + # # "universal_newline"] + # + # # But, if the last encoding is ASCII incompatible, + # # decorators are inserted before the last conversion. + # ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) + # p ec.convpath #=> ["crlf_newline", + # # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] + # + # # Conversion path can be specified directly. + # ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) + # p ec.convpath #=> ["universal_newline", + # # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], + # # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] + # - # - # possible options elements: - # hash form: - # :invalid => nil # raise error on invalid byte sequence (default) - # :invalid => :replace # replace invalid byte sequence - # :undef => nil # raise error on undefined conversion (default) - # :undef => :replace # replace undefined conversion - # :replace => string # replacement string ("?" or "\uFFFD" if not specified) - # :newline => :universal # decorator for converting CRLF and CR to LF - # :newline => :lf # decorator for converting CRLF and CR to LF when writing - # :newline => :crlf # decorator for converting LF to CRLF - # :newline => :cr # decorator for converting LF to CR - # :universal_newline => true # decorator for converting CRLF and CR to LF - # :crlf_newline => true # decorator for converting LF to CRLF - # :cr_newline => true # decorator for converting LF to CR - # :lf_newline => true # decorator for converting CRLF and CR to LF when writing - # :xml => :text # escape as XML CharData.
- # :xml => :attr # escape as XML AttValue - # integer form: - # Encoding::Converter::INVALID_REPLACE - # Encoding::Converter::UNDEF_REPLACE - # Encoding::Converter::UNDEF_HEX_CHARREF - # Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR - # Encoding::Converter::LF_NEWLINE_DECORATOR - # Encoding::Converter::CRLF_NEWLINE_DECORATOR - # Encoding::Converter::CR_NEWLINE_DECORATOR - # Encoding::Converter::XML_TEXT_DECORATOR - # Encoding::Converter::XML_ATTR_CONTENT_DECORATOR - # Encoding::Converter::XML_ATTR_QUOTE_DECORATOR - # - # Encoding::Converter.new creates an instance of Encoding::Converter. - # - # Source_encoding and destination_encoding should be a string or Encoding - # object. - # - # opt should be nil, a hash or an integer. - # - # convpath should be an array. convpath may contain - # * two-element arrays which contain encodings or encoding names, or - # * strings representing decorator names. - # - # - # Encoding::Converter.new optionally takes an option. The option should be a - # hash or an integer. The option hash can contain :invalid => nil, etc. The - # option integer should be logical-or of constants such as - # Encoding::Converter::INVALID_REPLACE, etc. - # - # :invalid => nil - # : Raise error on invalid byte sequence. This is a default behavior. - # :invalid => :replace - # : Replace invalid byte sequence by replacement string. - # :undef => nil - # : Raise an error if a character in source_encoding is not defined in - # destination_encoding. This is a default behavior. - # :undef => :replace - # : Replace undefined character in destination_encoding with replacement - # string. - # :replace => string - # : Specify the replacement string. If not specified, "uFFFD" is used for - # Unicode encodings and "?" for others. - # :universal_newline => true - # : Convert CRLF and CR to LF. - # :crlf_newline => true - # : Convert LF to CRLF. - # :cr_newline => true - # : Convert LF to CR. - # :lf_newline => true - # : Convert CRLF and CR to LF (when writing). 
- # :xml => :text - # : Escape as XML CharData. This form can be used as an HTML 4.0 #PCDATA. - # * '&' -> '&' - # * '<' -> '<' - # * '>' -> '>' - # * undefined characters in destination_encoding -> hexadecimal CharRef - # such as &#xHH; - # - # :xml => :attr - # : Escape as XML AttValue. The converted result is quoted as "...". This form - # can be used as an HTML 4.0 attribute value. - # * '&' -> '&' - # * '<' -> '<' - # * '>' -> '>' - # * '"' -> '"' - # * undefined characters in destination_encoding -> hexadecimal CharRef - # such as &#xHH; - # - # - # - # Examples: - # # UTF-16BE to UTF-8 - # ec = Encoding::Converter.new("UTF-16BE", "UTF-8") - # - # # Usually, decorators such as newline conversion are inserted last. - # ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) - # p ec.convpath #=> [[#, #], - # # "universal_newline"] - # - # # But, if the last encoding is ASCII incompatible, - # # decorators are inserted before the last conversion. - # ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) - # p ec.convpath #=> ["crlf_newline", - # # [#, #]] - # - # # Conversion path can be specified directly. - # ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) - # p ec.convpath #=> ["universal_newline", - # # [#, #], - # # [#, #]] - # - def initialize: (encoding source, encoding destination) -> void - | (encoding source, encoding destination, - ?invalid: :replace | nil, - ?undef: :replace | nil, - ?replace: String, - ?newline: :universal | :crlf | :cr, - ?universal_newline: bool, - ?crlf_newline: bool, - ?cr_newline: bool, - ?xml: :text | :attr - ) -> void - | (encoding source, encoding destination, Integer opts) -> void - | (conversion_path convpath) -> void -end + def initialize: (encoding source, encoding destination, ?int? 
flags) -> self + | (array[[encoding, encoding] | array[encoding] | decorator | string] convpath) -> self + | ( + encoding source, + encoding destination, + ?invalid: :replace | nil, + ?undef: :replace | nil, + ?replace: string?, + ?xml: (:text | :attr)?, + ?newline: (:universal | :crlf | :cr | :lf)?, + ?universal_newline: boolish, + ?crlf_newline: boolish, + ?cr_newline: boolish, + ?lf_newline: boolish + ) -> self + # + # + def ==: (untyped rhs) -> boolish -# -# AFTER_OUTPUT -# -# Stop converting after some output is complete but before all of the input was -# consumed. See primitive_convert for an example. -# -Encoding::Converter::AFTER_OUTPUT: Integer + # + # Convert source_string and return destination_string. + # + # source_string is assumed as a part of source. i.e. :partial_input=>true is + # specified internally. finish method should be used last. + # + # ec = Encoding::Converter.new("utf-8", "euc-jp") + # puts ec.convert("\u3042").dump #=> "\xA4\xA2" + # puts ec.finish.dump #=> "" + # + # ec = Encoding::Converter.new("euc-jp", "utf-8") + # puts ec.convert("\xA4").dump #=> "" + # puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" + # puts ec.finish.dump #=> "" + # + # ec = Encoding::Converter.new("utf-8", "iso-2022-jp") + # puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") + # puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") + # puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") + # puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP") + # + # If a conversion error occur, Encoding::UndefinedConversionError or + # Encoding::InvalidByteSequenceError is raised. Encoding::Converter#convert + # doesn't supply methods to recover or restart from these exceptions. When you + # want to handle these conversion errors, use + # Encoding::Converter#primitive_convert. 
+ # + def convert: (string source) -> String -# -# CRLF_NEWLINE_DECORATOR -# -# Decorator for converting LF to CRLF -# -Encoding::Converter::CRLF_NEWLINE_DECORATOR: Integer + # + # Returns the conversion path of ec. + # + # The result is an array of conversions. + # + # ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) + # p ec.convpath + # #=> [[#, #], + # # [#, #], + # # "crlf_newline"] + # + # Each element of the array is a pair of encodings or a string. A pair means an + # encoding conversion. A string means a decorator. + # + # In the above example, [#, #] means a + # converter from ISO-8859-1 to UTF-8. "crlf_newline" means newline converter + # from LF to CRLF. + # + def convpath: () -> Array[[Encoding, Encoding] | decorator] -# -# CR_NEWLINE_DECORATOR -# -# Decorator for converting LF to CR -# -Encoding::Converter::CR_NEWLINE_DECORATOR: Integer + # + # Returns the destination encoding as an Encoding object. + # + def destination_encoding: () -> Encoding -# -# INVALID_MASK -# -# Mask for invalid byte sequences -# -Encoding::Converter::INVALID_MASK: Integer + # + # Finishes the converter. It returns the last part of the converted string. + # + # ec = Encoding::Converter.new("utf-8", "iso-2022-jp") + # p ec.convert("\u3042") #=> "\e$B$\"" + # p ec.finish #=> "\e(B" + # + def finish: () -> String -# -# INVALID_REPLACE -# -# Replace invalid byte sequences -# -Encoding::Converter::INVALID_REPLACE: Integer + # + # Inserts string into the encoding converter. The string will be converted to + # the destination encoding and output on later conversions. + # + # If the destination encoding is stateful, string is converted according to the + # state and the state is updated. + # + # This method should be used only when a conversion error occurs. + # + # ec = Encoding::Converter.new("utf-8", "iso-8859-1") + # src = "HIRAGANA LETTER A is \u{3042}." 
+ # dst = "" + # p ec.primitive_convert(src, dst) #=> :undefined_conversion + # puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] + # ec.insert_output("") + # p ec.primitive_convert(src, dst) #=> :finished + # puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is .", ""] + # + # ec = Encoding::Converter.new("utf-8", "iso-2022-jp") + # src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp + # dst = "" + # p ec.primitive_convert(src, dst) #=> :undefined_conversion + # puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] + # ec.insert_output "?" # state change required to output "?". + # p ec.primitive_convert(src, dst) #=> :finished + # puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""] + # + def insert_output: (string str) -> nil -# -# PARTIAL_INPUT -# -# Indicates the source may be part of a larger string. See primitive_convert -# for an example. -# -Encoding::Converter::PARTIAL_INPUT: Integer + # + # Returns a printable version of *ec* + # + # ec = Encoding::Converter.new("iso-8859-1", "utf-8") + # puts ec.inspect #=> # + # + def inspect: () -> String -# -# UNDEF_HEX_CHARREF -# -# Replace byte sequences that are undefined in the destination encoding with an -# XML hexadecimal character reference. This is valid for XML conversion. -# -Encoding::Converter::UNDEF_HEX_CHARREF: Integer + # + # Returns an exception object for the last conversion. Returns nil if the last + # conversion did not produce an error. + # + # "error" means that Encoding::InvalidByteSequenceError and + # Encoding::UndefinedConversionError for Encoding::Converter#convert and + # :invalid_byte_sequence, :incomplete_input and :undefined_conversion for + # Encoding::Converter#primitive_convert. 
+ # + # ec = Encoding::Converter.new("utf-8", "iso-8859-1") + # p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence + # p ec.last_error #=> # + # p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full + # p ec.last_error #=> nil + # + def last_error: () -> (InvalidByteSequenceError | UndefinedConversionError)? -# -# UNDEF_MASK -# -# Mask for a valid character in the source encoding but no related character(s) -# in destination encoding. -# -Encoding::Converter::UNDEF_MASK: Integer + # + # possible opt elements: + # hash form: + # :partial_input => true # source buffer may be part of larger source + # :after_output => true # stop conversion after output before input + # integer form: + # Encoding::Converter::PARTIAL_INPUT + # Encoding::Converter::AFTER_OUTPUT + # + # possible results: + # :invalid_byte_sequence + # :incomplete_input + # :undefined_conversion + # :after_output + # :destination_buffer_full + # :source_buffer_empty + # :finished + # + # primitive_convert converts source_buffer into destination_buffer. + # + # source_buffer should be a string or nil. nil means an empty string. + # + # destination_buffer should be a string. + # + # destination_byteoffset should be an integer or nil. nil means the end of + # destination_buffer. If it is omitted, nil is assumed. + # + # destination_bytesize should be an integer or nil. nil means unlimited. If it + # is omitted, nil is assumed. + # + # opt should be nil, a hash or an integer. nil means no flags. If it is omitted, + # nil is assumed. + # + # primitive_convert converts the content of source_buffer from beginning and + # store the result into destination_buffer. + # + # destination_byteoffset and destination_bytesize specify the region which the + # converted result is stored. destination_byteoffset specifies the start + # position in destination_buffer in bytes. If destination_byteoffset is nil, + # destination_buffer.bytesize is used for appending the result. 
+ # destination_bytesize specifies maximum number of bytes. If + # destination_bytesize is nil, destination size is unlimited. After conversion, + # destination_buffer is resized to destination_byteoffset + actually produced + # number of bytes. Also destination_buffer's encoding is set to + # destination_encoding. + # + # primitive_convert drops the converted part of source_buffer. the dropped part + # is converted in destination_buffer or buffered in Encoding::Converter object. + # + # primitive_convert stops conversion when one of following condition met. + # * invalid byte sequence found in source buffer (:invalid_byte_sequence) + # `primitive_errinfo` and `last_error` methods returns the detail of the + # error. + # * unexpected end of source buffer (:incomplete_input) this occur only when + # :partial_input is not specified. `primitive_errinfo` and `last_error` + # methods returns the detail of the error. + # * character not representable in output encoding (:undefined_conversion) + # `primitive_errinfo` and `last_error` methods returns the detail of the + # error. + # * after some output is generated, before input is done (:after_output) this + # occur only when :after_output is specified. + # * destination buffer is full (:destination_buffer_full) this occur only when + # destination_bytesize is non-nil. + # * source buffer is empty (:source_buffer_empty) this occur only when + # :partial_input is specified. 
+ # * conversion is finished (:finished) + # + # + # example: + # ec = Encoding::Converter.new("UTF-8", "UTF-16BE") + # ret = ec.primitive_convert(src="pi", dst="", nil, 100) + # p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] + # + # ec = Encoding::Converter.new("UTF-8", "UTF-16BE") + # ret = ec.primitive_convert(src="pi", dst="", nil, 1) + # p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] + # ret = ec.primitive_convert(src, dst="", nil, 1) + # p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] + # ret = ec.primitive_convert(src, dst="", nil, 1) + # p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] + # ret = ec.primitive_convert(src, dst="", nil, 1) + # p [ret, src, dst] #=> [:finished, "", "i"] + # + def primitive_convert: ( + string? source, + string destination, + ?int? destination_byteoffset, + ?int? destination_bytesize, + ?nil, + ?partial_input: boolish, + ?after_output: boolish + ) -> convert_result + | ( + string? source, + string destination, + ?int? destination_byteoffset, + ?int? destination_bytesize, + ?int flags + ) -> convert_result -# -# UNDEF_REPLACE -# -# Replace byte sequences that are undefined in the destination encoding. -# -Encoding::Converter::UNDEF_REPLACE: Integer + # + # primitive_errinfo returns important information regarding the last error as a + # 5-element array: + # + # [result, enc1, enc2, error_bytes, readagain_bytes] + # + # result is the last result of primitive_convert. + # + # Other elements are only meaningful when result is :invalid_byte_sequence, + # :incomplete_input or :undefined_conversion. + # + # enc1 and enc2 indicate a conversion step as a pair of strings. For example, a + # converter from EUC-JP to ISO-8859-1 converts a string as follows: EUC-JP -> + # UTF-8 -> ISO-8859-1. So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or + # ["UTF-8", "ISO-8859-1"]. + # + # error_bytes and readagain_bytes indicate the byte sequences which caused the + # error. error_bytes is discarded portion. 
readagain_bytes is buffered portion + # which is read again on next conversion. + # + # Example: + # + # # \xff is invalid as EUC-JP. + # ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") + # ec.primitive_convert(src="\xff", dst="", nil, 10) + # p ec.primitive_errinfo + # #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""] + # + # # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. + # # Since this error is occur in UTF-8 to ISO-8859-1 conversion, + # # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). + # ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + # ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) + # p ec.primitive_errinfo + # #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] + # + # # partial character is invalid + # ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + # ec.primitive_convert(src="\xa4", dst="", nil, 10) + # p ec.primitive_errinfo + # #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] + # + # # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by + # # partial characters. + # ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + # ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) + # p ec.primitive_errinfo + # #=> [:source_buffer_empty, nil, nil, nil, nil] + # + # # \xd8\x00\x00@ is invalid as UTF-16BE because + # # no low surrogate after high surrogate (\xd8\x00). + # # It is detected by 3rd byte (\00) which is part of next character. + # # So the high surrogate (\xd8\x00) is discarded and + # # the 3rd byte is read again later. + # # Since the byte is buffered in ec, it is dropped from src. + # ec = Encoding::Converter.new("UTF-16BE", "UTF-8") + # ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) + # p ec.primitive_errinfo + # #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] + # p src + # #=> "@" + # + # # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. 
+ # # The problem is detected by 4th byte. + # ec = Encoding::Converter.new("UTF-16LE", "UTF-8") + # ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) + # p ec.primitive_errinfo + # #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] + # p src + # #=> "" + # + def primitive_errinfo: () -> [convert_result, String?, String?, String?, String?] -# -# UNIVERSAL_NEWLINE_DECORATOR -# -# Decorator for converting CRLF and CR to LF -# -Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR: Integer + # + # Put back the bytes which will be converted. + # + # The bytes are caused by invalid_byte_sequence error. When + # invalid_byte_sequence error, some bytes are discarded and some bytes are + # buffered to be converted later. The latter bytes can be put back. It can be + # observed by Encoding::InvalidByteSequenceError#readagain_bytes and + # Encoding::Converter#primitive_errinfo. + # + # ec = Encoding::Converter.new("utf-16le", "iso-8859-1") + # src = "\x00\xd8\x61\x00" + # dst = "" + # p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence + # p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] + # p ec.putback #=> "a\x00" + # p ec.putback #=> "" # no more bytes to put back + # + def putback: (?int? max_numbytes) -> String -# -# XML_ATTR_CONTENT_DECORATOR -# -# Escape as XML AttValue -# -Encoding::Converter::XML_ATTR_CONTENT_DECORATOR: Integer + # + # Returns the replacement string. + # + # ec = Encoding::Converter.new("euc-jp", "us-ascii") + # p ec.replacement #=> "?" + # + # ec = Encoding::Converter.new("euc-jp", "utf-8") + # p ec.replacement #=> "\uFFFD" + # + def replacement: () -> String -# -# XML_ATTR_QUOTE_DECORATOR -# -# Escape as XML AttValue -# -Encoding::Converter::XML_ATTR_QUOTE_DECORATOR: Integer + # + # Sets the replacement string. 
+ # + # ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) + # ec.replacement = "" + # p ec.convert("a \u3042 b") #=> "a b" + # + def replacement=: [T < _ToStr] (T str) -> T -# -# XML_TEXT_DECORATOR -# -# Escape as XML CharData -# -Encoding::Converter::XML_TEXT_DECORATOR: Integer + # + # Returns the source encoding as an Encoding object. + # + def source_encoding: () -> Encoding + end +end diff --git a/test/stdlib/Encoding_test.rb b/test/stdlib/Encoding_test.rb index 2f150a043..93f508659 100644 --- a/test/stdlib/Encoding_test.rb +++ b/test/stdlib/Encoding_test.rb @@ -275,3 +275,283 @@ def error_object ec.last_error end end + +class Encoding_ConverterSingletonTest < Test::Unit::TestCase + include TypeAssertions + + testing 'singleton(::Encoding::Converter)' + + def test_asciicompat_encoding + with_encoding 'ISO-2022-JP' do |enc| + assert_send_type '(encoding) -> Encoding', + Encoding::Converter, :asciicompat_encoding, enc + end + + with_encoding 'UTF-8' do |enc| + assert_send_type '(encoding) -> nil', + Encoding::Converter, :asciicompat_encoding, enc + end + end + + def test_search_convpath + with_encoding 'ISO-8859-1' do |src| + with_encoding 'EUC-JP' do |dst| + assert_send_type '(encoding, encoding) -> ::Encoding::Converter::conversion_path', + Encoding::Converter, :search_convpath, src, dst + + with_int(0).chain([nil]).each do |flags| + assert_send_type '(encoding, encoding, int?) -> ::Encoding::Converter::conversion_path', + Encoding::Converter, :search_convpath, src, dst, flags + end + + [:text, :attr, nil].each do |xml| + [:universal, :crlf, :cr, :lf, nil].each do |newline| + assert_send_type '(encoding, encoding, xml: (:text | :attr)?, newline: (:universal | :crlf | :cr | :lf)?) -> ::Encoding::Converter::conversion_path', + Encoding::Converter, :search_convpath, src, dst, xml: xml, newline: newline + end + + # Theoretically you can pass all the kwargs at once, but i cant find a converter which'll accept more than one truthy one. 
+ [1, nil, Object.new, false].each do |boolish| + assert_send_type '(encoding, encoding, xml: (:text | :attr)?, universal_newline: boolish) -> ::Encoding::Converter::conversion_path', + Encoding::Converter, :search_convpath, src, dst, xml: xml, universal_newline: boolish + assert_send_type '(encoding, encoding, xml: (:text | :attr)?, crlf_newline: boolish) -> ::Encoding::Converter::conversion_path', + Encoding::Converter, :search_convpath, src, dst, xml: xml, crlf_newline: boolish + assert_send_type '(encoding, encoding, xml: (:text | :attr)?, cr_newline: boolish) -> ::Encoding::Converter::conversion_path', + Encoding::Converter, :search_convpath, src, dst, xml: xml, cr_newline: boolish + assert_send_type '(encoding, encoding, xml: (:text | :attr)?, lf_newline: boolish) -> ::Encoding::Converter::conversion_path', + Encoding::Converter, :search_convpath, src, dst, xml: xml, lf_newline: boolish + end + end + end + end + end + + def test_AFTER_OUTPUT + assert_const_type 'Integer', + 'Encoding::Converter::AFTER_OUTPUT' + end + + def test_CRLF_NEWLINE_DECORATOR + assert_const_type 'Integer', + 'Encoding::Converter::CRLF_NEWLINE_DECORATOR' + end + + def test_CR_NEWLINE_DECORATOR + assert_const_type 'Integer', + 'Encoding::Converter::CR_NEWLINE_DECORATOR' + end + + def test_INVALID_MASK + assert_const_type 'Integer', + 'Encoding::Converter::INVALID_MASK' + end + + def test_INVALID_REPLACE + assert_const_type 'Integer', + 'Encoding::Converter::INVALID_REPLACE' + end + + def test_PARTIAL_INPUT + assert_const_type 'Integer', + 'Encoding::Converter::PARTIAL_INPUT' + end + + def test_UNDEF_HEX_CHARREF + assert_const_type 'Integer', + 'Encoding::Converter::UNDEF_HEX_CHARREF' + end + + def test_UNDEF_MASK + assert_const_type 'Integer', + 'Encoding::Converter::UNDEF_MASK' + end + + def test_UNDEF_REPLACE + assert_const_type 'Integer', + 'Encoding::Converter::UNDEF_REPLACE' + end + + def test_UNIVERSAL_NEWLINE_DECORATOR + assert_const_type 'Integer', + 
'Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR' + end + + def test_XML_ATTR_CONTENT_DECORATOR + assert_const_type 'Integer', + 'Encoding::Converter::XML_ATTR_CONTENT_DECORATOR' + end + + def test_XML_ATTR_QUOTE_DECORATOR + assert_const_type 'Integer', + 'Encoding::Converter::XML_ATTR_QUOTE_DECORATOR' + end + + def test_XML_TEXT_DECORATOR + assert_const_type 'Integer', + 'Encoding::Converter::XML_TEXT_DECORATOR' + end +end + +class Encoding_ConverterInstanceTest < Test::Unit::TestCase + include TypeAssertions + + testing '::Encoding::Converter' + + def test_initialize + with_encoding 'ISO-8859-1' do |src| + with_encoding 'EUC-JP' do |dst| + assert_send_type '(encoding, encoding) -> void', + Encoding::Converter.allocate, :initialize, src, dst + + with_int(0).chain([nil]) do |flags| + end + end + end + # def initialize: (encoding src, encoding destination, ?int? flags) -> self + # | (array[[encoding, encoding] | array[encoding] | decorator | string] convpath) -> self + # | ( + # encoding src, + # encoding destination, + # ?invalid: :replace?, + # ?undef: :replace?, + # ?replace: string?, + # ?xml: (:text | :attr)?, + # ?newline: (:universal | :crlf | :cr | :lf)?, + # ?universal_newline: boolish, + # ?crlf_newline: boolish, + # ?cr_newline: boolish, + # ?lf_newline: boolish + # ) -> self + end + + def new_instance(src = 'ISO-8859-1', dst = 'EUC-JP') + Encoding::Converter.new(src, dst) + end + + def test_eq + [new_instance, 1, Object.new, true, false, 1r].each do |rhs| + assert_send_type '(untyped) -> boolish', + new_instance, :==, rhs + end + end + + def test_convert + with_string "A" do |string| + assert_send_type '(string) -> String', + new_instance, :convert, string + end + end + + def test_convpath + assert_send_type '() -> Array[[Encoding, Encoding] | decorator]', + new_instance, :convpath + end + + def test_destination_encoding + assert_send_type '() -> Encoding', + new_instance, :destination_encoding + end + + def test_finish + assert_send_type '() -> 
String', + new_instance, :finish + end + + def test_insert_output + with_string "hello" do |string| + assert_send_type '(string) -> nil', + new_instance, :insert_output, string + end + end + + def test_inspect + assert_send_type '() -> String', + new_instance, :inspect + end + + def test_last_error + assert_send_type '() -> nil', + new_instance, :last_error + + instance1 = new_instance('UTF-8', 'ISO-8859-1') + instance1.primitive_convert "\xF1abcd", '' + assert_send_type '() -> ::Encoding::InvalidByteSequenceError', + instance1, :last_error + + instance2 = new_instance('ISO-8859-1', 'EUC-JP') + instance2.primitive_convert "\xA0", '' + assert_send_type '() -> ::Encoding::UndefinedConversionError', + instance2, :last_error + end + + def test_primitive_convert + instance = new_instance + + with_string.chain([nil]).each do |src| + with_string do |dst| + assert_send_type '(string?, string) -> ::Encoding::Converter::convert_result', + instance, :primitive_convert, src, dst + + with_int(0).chain([nil]).each do |dst_offset| + assert_send_type '(string?, string, int?) -> ::Encoding::Converter::convert_result', + instance, :primitive_convert, src, dst, dst_offset + + with_int(0).chain([nil]).each do |dst_size| + assert_send_type '(string?, string, int?, int?) 
-> ::Encoding::Converter::convert_result', + instance, :primitive_convert, src, dst, dst_offset, dst_size + + with_int(0).each do |flags| + assert_send_type '(string?, string, int?, int?, int) -> ::Encoding::Converter::convert_result', + instance, :primitive_convert, src, dst, dst_offset, dst_size, flags + end + + [1, Object.new, nil].each do |boolish| + assert_send_type '(string?, string, int?, int?, partial_input: boolish, after_output: boolish) -> ::Encoding::Converter::convert_result', + instance, :primitive_convert, src, dst, dst_offset, dst_size, partial_input: boolish, after_output: boolish + + assert_send_type '(string?, string, int?, int?, nil, partial_input: boolish, after_output: boolish) -> ::Encoding::Converter::convert_result', + instance, :primitive_convert, src, dst, dst_offset, dst_size, nil, partial_input: boolish, after_output: boolish + end + end + end + end + end + end + + def test_primitive_errinfo + assert_send_type '() -> [::Encoding::Converter::convert_result, nil, nil, nil, nil]', + new_instance, :primitive_errinfo + + instance = new_instance('ISO-8859-1', 'EUC-JP') + instance.primitive_convert "\xA0", '' + assert_send_type '() -> [::Encoding::Converter::convert_result, String, String, String, String]', + instance, :primitive_errinfo + end + + def test_putback + assert_send_type '() -> String', + new_instance, :putback + + with_int.chain([nil]).each do |max_numbytes| + assert_send_type '(int?) -> String', + new_instance, :putback, max_numbytes + end + end + + def test_replacement + assert_send_type '() -> String', + new_instance, :replacement + end + + def test_replacement_eq + with_string do |replacement| + assert_send_type '[T < _ToStr] (T) -> T', + new_instance, :replacement=, replacement + end + end + + def test_source_encoding + assert_send_type '() -> Encoding', + new_instance, :source_encoding + end +end