From 2903b9c0ad186591991491daebacd7c9831bebb2 Mon Sep 17 00:00:00 2001 From: "nicholas a. evans" Date: Tue, 26 Jul 2022 10:14:37 -0400 Subject: [PATCH] Add SASLprep. Code generated & tested with RFC3454 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC-4422 recommends that mechanisms SHOULD prepare simple usernames and passwords with SASLprep. And SASLprep is required by the `SCRAM-*` mechanisms, which will be added in a future PR. SASLprep is also recommended for the `PLAIN` SASL mechanism and for the `ACL` IMAP extension but—in both cases—string preparation is done by the *server*, at its own discretion. SASLprep has been officially obsoleted by PRECIS. I don't believe any IMAP RFCs have allowed replacing SASLprep with PRECIS yet. In contrast, RFC-7622 updates XMPP to require PRECIS. (See RFC-8265 for more info on PRECIS, and Section 6 for migration considerations.) Rather than create a fully generic StringPrep superclass or function, the SASLprep profile is optimized. Just enough of the generic StringPrep algorithm has been implemented to provide more detailed errors for prohibited strings. Future PRs can expand it as needed, to implement other profiles. In particular, the `trace` StringPrep profile is a requirement for clients using the `ANONYMOUS` mechanism. Many other StringPrep implementations store the tables as an array of ranges and loop over every character in the input. But using Regexp is simpler and much faster, especially where the tables closely match Unicode character classes (benchmarks are included). Some StringPrep tables use Regexps that are generated from the RFC-3454 appendices. Manually written regular expressions are used in cases where there is a close match between Unicode character classes and the SASLprep tables. All regexps are tested against the RFC tables with every valid codepoint, to verify they aren't broken if their character classes are changed by new versions of Unicode. 
Additionally: * Added `rake rfcs` to download many IMAP-related RFCs, for convenience. * The new code is namespaced under `Net::IMAP::SASL`. We could move the authenticators there too. If SASL functionality is ever extracted to another gem, we can use: `Net::IMAP::SASL = Net::SASL` for backward compatibility. --- .gitignore | 1 + Rakefile | 3 + benchmarks/stringprep.yml | 65 ++++ benchmarks/table-regexps.yml | 39 +++ lib/net/imap.rb | 1 + lib/net/imap/sasl.rb | 74 +++++ lib/net/imap/sasl/saslprep.rb | 55 +++ lib/net/imap/sasl/saslprep_tables.rb | 98 ++++++ lib/net/imap/sasl/stringprep.rb | 68 ++++ lib/net/imap/sasl/stringprep_tables.rb | 153 +++++++++ rakelib/rfcs.rake | 98 ++++++ rakelib/saslprep.rake | 30 ++ rakelib/string_prep_tables_generator.rb | 423 ++++++++++++++++++++++++ test/net/imap/test_saslprep.rb | 71 ++++ test/net/imap/test_stringprep.rb | 75 +++++ 15 files changed, 1254 insertions(+) create mode 100644 benchmarks/stringprep.yml create mode 100644 benchmarks/table-regexps.yml create mode 100644 lib/net/imap/sasl.rb create mode 100644 lib/net/imap/sasl/saslprep.rb create mode 100644 lib/net/imap/sasl/saslprep_tables.rb create mode 100644 lib/net/imap/sasl/stringprep.rb create mode 100644 lib/net/imap/sasl/stringprep_tables.rb create mode 100644 rakelib/rfcs.rake create mode 100644 rakelib/saslprep.rake create mode 100644 rakelib/string_prep_tables_generator.rb create mode 100644 test/net/imap/test_saslprep.rb create mode 100644 test/net/imap/test_stringprep.rb diff --git a/.gitignore b/.gitignore index dc96cb90..cd342fb5 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ /coverage/ /doc/ /pkg/ +/rfcs /spec/reports/ /tmp/ /Gemfile.lock diff --git a/Rakefile b/Rakefile index 5a7afabd..90501051 100644 --- a/Rakefile +++ b/Rakefile @@ -1,5 +1,8 @@ +# frozen_string_literal: true + require "bundler/gem_tasks" require "rake/testtask" +require "rake/clean" Rake::TestTask.new(:test) do |t| t.libs << "test/lib" diff --git a/benchmarks/stringprep.yml 
b/benchmarks/stringprep.yml new file mode 100644 index 00000000..df253715 --- /dev/null +++ b/benchmarks/stringprep.yml @@ -0,0 +1,65 @@ +--- +prelude: | + begin + require "mongo" # gem install mongo + require "idn" # gem install idn-ruby + rescue LoadError + warn "You must 'gem install mongo idn-ruby' for this benchmark." + raise + end + + MStrPrep = Mongo::Auth::StringPrep + + # this indirection will slow it down a little bit + def mongo_saslprep(string) + MStrPrep.prepare(string, + MStrPrep::Profiles::SASL::MAPPINGS, + MStrPrep::Profiles::SASL::PROHIBITED, + normalize: true, + bidi: true) + rescue Mongo::Error::FailedStringPrepValidation + nil + end + + $LOAD_PATH.unshift "./lib" + require "net/imap" + def net_imap_saslprep(string) + Net::IMAP::SASL::SASLprep.saslprep string, exception: false + end + + def libidn_saslprep(string) + IDN::Stringprep.with_profile(string, "SASLprep") + rescue IDN::Stringprep::StringprepError + nil + end + +benchmark: + - net_imap_saslprep "I\u00ADX" # RFC example 1. IX + - net_imap_saslprep "user" # RFC example 2. user + - net_imap_saslprep "USER" # RFC example 3. user + - net_imap_saslprep "\u00aa" # RFC example 4. a + - net_imap_saslprep "\u2168" # RFC example 5. IX + - net_imap_saslprep "\u0007" # RFC example 6. Error - prohibited character + - net_imap_saslprep "\u0627\u0031" # RFC example 7. Error - bidirectional check + - net_imap_saslprep "I\u2000X" # map to space: I X + - net_imap_saslprep "a longer string, e.g. a password" + + - libidn_saslprep "I\u00ADX" # RFC example 1. IX + - libidn_saslprep "user" # RFC example 2. user + - libidn_saslprep "USER" # RFC example 3. user + - libidn_saslprep "\u00aa" # RFC example 4. a + - libidn_saslprep "\u2168" # RFC example 5. IX + - libidn_saslprep "\u0007" # RFC example 6. Error - prohibited character + - libidn_saslprep "\u0627\u0031" # RFC example 7. Error - bidirectional check + - libidn_saslprep "I\u2000X" # map to space: I X + - libidn_saslprep "a longer string, e.g. 
a password" + + - mongo_saslprep "I\u00ADX" # RFC example 1. IX + - mongo_saslprep "user" # RFC example 2. user + - mongo_saslprep "USER" # RFC example 3. user + - mongo_saslprep "\u00aa" # RFC example 4. a + - mongo_saslprep "\u2168" # RFC example 5. IX + - mongo_saslprep "\u0007" # RFC example 6. Error - prohibited character + - mongo_saslprep "\u0627\u0031" # RFC example 7. Error - bidirectional check + - mongo_saslprep "I\u2000X" # map to space: I X + - mongo_saslprep "a longer string, e.g. a password" diff --git a/benchmarks/table-regexps.yml b/benchmarks/table-regexps.yml new file mode 100644 index 00000000..446e7e57 --- /dev/null +++ b/benchmarks/table-regexps.yml @@ -0,0 +1,39 @@ +prelude: | + require "json" + require "set" + + all_codepoints = (0..0x10ffff).map{_1.chr("UTF-8") rescue nil}.compact + + rfc3454_tables = Dir["rfcs/rfc3454*.json"] + .first + .then{File.read _1} + .then{JSON.parse _1} + titles = rfc3454_tables.delete("titles") + + sets = rfc3454_tables + .transform_values{|t|t.keys rescue t} + .transform_values{|table| + table + .map{_1.split(?-).map{|i|Integer i, 16}} + .flat_map{_2 ? 
(_1.._2).to_a : _1} + .to_set + } + + TABLE_A1_SET = sets.fetch "A.1" + ASSIGNED_3_2 = /\p{AGE=3.2}/ + UNASSIGNED_3_2 = /\P{AGE=3.2}/ + TABLE_A1_REGEX = /(?-mix:[\u{0000}-\u{001f}\u{007f}-\u{00a0}\u{0340}-\u{0341}\u{06dd}\u{070f}\u{1680}\u{180e}\u{2000}-\u{200f}\u{2028}-\u{202f}\u{205f}-\u{2063}\u{206a}-\u{206f}\u{2ff0}-\u{2ffb}\u{3000}\u{e000}-\u{f8ff}\u{fdd0}-\u{fdef}\u{feff}\u{fff9}-\u{ffff}\u{1d173}-\u{1d17a}\u{1fffe}-\u{1ffff}\u{2fffe}-\u{2ffff}\u{3fffe}-\u{3ffff}\u{4fffe}-\u{4ffff}\u{5fffe}-\u{5ffff}\u{6fffe}-\u{6ffff}\u{7fffe}-\u{7ffff}\u{8fffe}-\u{8ffff}\u{9fffe}-\u{9ffff}\u{afffe}-\u{affff}\u{bfffe}-\u{bffff}\u{cfffe}-\u{cffff}\u{dfffe}-\u{dffff}\u{e0001}\u{e0020}-\u{e007f}\u{efffe}-\u{10ffff}])|(?-mix:\p{Cs})/.freeze + +benchmark: + + # matches A.1 + - script: "all_codepoints.grep(TABLE_A1_SET)" + - script: "all_codepoints.grep(TABLE_A1_REGEX)" + - script: "all_codepoints.grep(UNASSIGNED_3_2)" + - script: "all_codepoints.grep_v(ASSIGNED_3_2)" + + # doesn't match A.1 + - script: "all_codepoints.grep_v(TABLE_A1_SET)" + - script: "all_codepoints.grep_v(TABLE_A1_REGEX)" + - script: "all_codepoints.grep_v(UNASSIGNED_3_2)" + - script: "all_codepoints.grep(ASSIGNED_3_2)" diff --git a/lib/net/imap.rb b/lib/net/imap.rb index 75449fd0..ab091464 100644 --- a/lib/net/imap.rb +++ b/lib/net/imap.rb @@ -1474,3 +1474,4 @@ def start_tls_session(params = {}) require_relative "imap/response_data" require_relative "imap/response_parser" require_relative "imap/authenticators" +require_relative "imap/sasl" diff --git a/lib/net/imap/sasl.rb b/lib/net/imap/sasl.rb new file mode 100644 index 00000000..17f4c5ba --- /dev/null +++ b/lib/net/imap/sasl.rb @@ -0,0 +1,74 @@ +# frozen_string_literal: true + +module Net + class IMAP + + # Pluggable authentication mechanisms for protocols which support SASL + # (Simple Authentication and Security Layer), such as IMAP4, SMTP, LDAP, and + # XMPP. 
{RFC-4422}[https://tools.ietf.org/html/rfc4422] specifies the
+    # common SASL framework and the +EXTERNAL+ mechanism, and the
+    # {SASL mechanism registry}[https://www.iana.org/assignments/sasl-mechanisms/sasl-mechanisms.xhtml]
+    # lists the specification for others.
+    #
+    # "SASL is conceptually a framework that provides an abstraction layer
+    # between protocols and mechanisms as illustrated in the following diagram."
+    #
+    #               SMTP    LDAP    XMPP   Other protocols ...
+    #                  \       |    |      /
+    #                   \      |    |     /
+    #                  SASL abstraction layer
+    #                   /      |    |     \
+    #                  /       |    |      \
+    #           EXTERNAL   GSSAPI  PLAIN   Other mechanisms ...
+    #
+    module SASL
+
+      # Autoloaded (paths relative to this file) so the regexps load on demand.
+
+      autoload :StringPrep, File.expand_path("sasl/stringprep", __dir__)
+      autoload :SASLprep, File.expand_path("sasl/saslprep", __dir__)
+
+      # ArgumentError raised when +string+ is invalid for the stringprep
+      # +profile+.
+      class StringPrepError < ArgumentError
+        attr_reader :string, :profile
+
+        def initialize(*args, string: nil, profile: nil)
+          @string = -string.to_str unless string.nil?
+          @profile = -profile.to_str unless profile.nil?
+          super(*args)
+        end
+      end
+
+      # StringPrepError raised when +string+ contains a codepoint prohibited by
+      # +table+.
+      class ProhibitedCodepoint < StringPrepError
+        attr_reader :table
+
+        def initialize(table, *args, **kwargs)
+          @table = -table.to_str
+          details = (title = StringPrep::TABLE_TITLES[table]) ?
+            "%s [%s]" % [title, table] : table
+          message = "String contains a prohibited codepoint: %s" % [details]
+          super(message, *args, **kwargs)
+        end
+      end
+
+      # StringPrepError raised when +string+ contains bidirectional characters
+      # which violate the StringPrep requirements.
+      class BidiStringError < StringPrepError
+      end
+
+      module_function
+
+      # See SASLprep#saslprep.
+      def saslprep(string, **opts)
+        SASLprep.saslprep(string, **opts)
+      end
+
+    end
+  end
+
+end
+
+Net::IMAP.extend Net::IMAP::SASL
diff --git a/lib/net/imap/sasl/saslprep.rb b/lib/net/imap/sasl/saslprep.rb
new file mode 100644
index 00000000..35c9ed67
--- /dev/null
+++ b/lib/net/imap/sasl/saslprep.rb
@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+
+require_relative "saslprep_tables"
+
+module Net::IMAP::SASL
+
+  # SASLprep#saslprep can be used to prepare a string according to [RFC4013].
+  #
+  # \SASLprep maps characters three ways: to nothing, to space, and Unicode
+  # normalization form KC. \SASLprep prohibits codepoints from nearly all
+  # standard StringPrep tables (RFC3454, Appendix "C"), and uses \StringPrep's
+  # standard bidirectional characters requirements (Appendix "D"). \SASLprep
+  # also uses \StringPrep's definition of "Unassigned" codepoints (Appendix "A").
+  module SASLprep
+
+    # Used to short-circuit strings that don't need preparation.
+    ASCII_NO_CTRLS = /\A[\x20-\x7e]*\z/u.freeze
+
+    module_function
+
+    # Prepares a UTF-8 +str+ for comparison, using the \SASLprep profile
+    # (RFC4013) of the \StringPrep algorithm (RFC3454). Printable ASCII
+    # input is returned as-is; other input is mapped and normalized (NFKC).
+    #
+    # By default, prohibited strings will return +nil+. When +exception+ is
+    # +true+, a StringPrepError describing the violation will be raised.
+    #
+    # When +stored+ is +true+, "unassigned" codepoints will be prohibited. For
+    # \StringPrep and the \SASLprep profile, "unassigned" refers to Unicode 3.2,
+    # and not later versions. See RFC3454 §7 for more information.
+    def saslprep(str, stored: false, exception: false)
+      return str if ASCII_NO_CTRLS.match?(str) # raises on incompatible encoding
+      str = str.encode("UTF-8") # also dups (and raises for invalid encoding)
+      str.gsub!(MAP_TO_SPACE, " ")
+      str.gsub!(MAP_TO_NOTHING, "")
+      str.unicode_normalize!(:nfkc)
+      # These regexps combine the prohibited and bidirectional checks
+      return str unless str.match?(stored ? PROHIBITED_STORED : PROHIBITED)
+      return nil unless exception
+      # raise helpful errors to indicate *why* it failed:
+      tables = stored ? TABLES_PROHIBITED_STORED : TABLES_PROHIBITED
+      StringPrep.check_prohibited! str, *tables, bidi: true, profile: "SASLprep"
+      raise StringPrepError.new( # fallback: combined regexp matched, no table did
+        "unknown error", string: str, profile: "SASLprep"
+      )
+    rescue ArgumentError, Encoding::CompatibilityError => ex
+      if /invalid byte sequence|incompatible encoding/.match? ex.message
+        return nil unless exception
+        raise StringPrepError.new(ex.message, string: str, profile: "SASLprep")
+      end
+      raise # not an encoding problem (e.g. a StringPrepError): propagate as-is
+    end
+
+  end
+end
diff --git a/lib/net/imap/sasl/saslprep_tables.rb b/lib/net/imap/sasl/saslprep_tables.rb
new file mode 100644
index 00000000..ca10fd55
--- /dev/null
+++ b/lib/net/imap/sasl/saslprep_tables.rb
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+#--
+# This file is generated from RFC3454, by rake. Don't edit directly.
+#++
+
+module Net::IMAP::SASL
+
+  module SASLprep
+
+    # RFC4013 §2.1 Mapping - mapped to space
+    # * non-ASCII space characters (\StringPrep\[\"C.1.2\"]) that can be
+    #   mapped to SPACE (U+0020), and
+    #
+    # Equal to \StringPrep\[\"C.1.2\"].
+    # Redefined here to avoid loading the StringPrep module.
+    MAP_TO_SPACE = /[\u200b\p{Zs}&&[^ ]]/.freeze
+
+    # RFC4013 §2.1 Mapping - mapped to nothing
+    # the "commonly mapped to nothing" characters (\StringPrep\[\"B.1\"])
+    # that can be mapped to nothing.
+    #
+    # Equal to \StringPrep\[\"B.1\"].
+    # Redefined here to avoid loading the StringPrep module.
+ MAP_TO_NOTHING = /[\u{00ad 034f 1806 2060 feff}\u{180b}-\u{180d}\u{200b}-\u{200d}\u{fe00}-\u{fe0f}]/.freeze + + # RFC4013 §2.3 Prohibited Output:: + # * Non-ASCII space characters — \StringPrep\[\"C.1.2\"] + # * ASCII control characters — \StringPrep\[\"C.2.1\"] + # * Non-ASCII control characters — \StringPrep\[\"C.2.2\"] + # * Private Use characters — \StringPrep\[\"C.3\"] + # * Non-character code points — \StringPrep\[\"C.4\"] + # * Surrogate code points — \StringPrep\[\"C.5\"] + # * Inappropriate for plain text characters — \StringPrep\[\"C.6\"] + # * Inappropriate for canonical representation characters — \StringPrep\[\"C.7\"] + # * Change display properties or deprecated characters — \StringPrep\[\"C.8\"] + # * Tagging characters — \StringPrep\[\"C.9\"] + TABLES_PROHIBITED = ["C.1.2", "C.2.1", "C.2.2", "C.3", "C.4", "C.5", "C.6", "C.7", "C.8", "C.9"].freeze + + # Adds unassigned (by Unicode 3.2) codepoints to TABLES_PROHIBITED. + # + # RFC4013 §2.5 Unassigned Code Points:: + # This profile specifies the \StringPrep\[\"A.1\"] table as its list of + # unassigned code points. + TABLES_PROHIBITED_STORED = ["A.1", *TABLES_PROHIBITED].freeze + + # Matches codepoints prohibited by RFC4013 §2.3. + # + # See TABLES_PROHIBITED. + # + # Equal to +Regexp.union+ of the TABLES_PROHIBITED tables. Redefined + # here to avoid loading the StringPrep module unless necessary. 
+ PROHIBITED_OUTPUT = /[\u{06dd 070f 1680 180e 3000 feff e0001}\u{0000}-\u{001f}\u{007f}-\u{00a0}\u{0340}-\u{0341}\u{2000}-\u{200f}\u{2028}-\u{202f}\u{205f}-\u{2063}\u{206a}-\u{206f}\u{2ff0}-\u{2ffb}\u{e000}-\u{f8ff}\u{fdd0}-\u{fdef}\u{fff9}-\u{ffff}\u{1d173}-\u{1d17a}\u{1fffe}-\u{1ffff}\u{2fffe}-\u{2ffff}\u{3fffe}-\u{3ffff}\u{4fffe}-\u{4ffff}\u{5fffe}-\u{5ffff}\u{6fffe}-\u{6ffff}\u{7fffe}-\u{7ffff}\u{8fffe}-\u{8ffff}\u{9fffe}-\u{9ffff}\u{afffe}-\u{affff}\u{bfffe}-\u{bffff}\u{cfffe}-\u{cffff}\u{dfffe}-\u{dffff}\u{e0020}-\u{e007f}\u{efffe}-\u{10ffff}\p{Cs}]/.freeze + + # RFC4013 §2.5 Unassigned Code Points:: + # This profile specifies the \StringPrep\[\"A.1\"] table as its list of + # unassigned code points. + UNASSIGNED = /\p{^AGE=3.2}/.freeze + + # Matches codepoints prohibited by RFC4013 §2.3 and §2.5. + # + # See TABLES_PROHIBITED_STORED. + PROHIBITED_OUTPUT_STORED = Regexp.union( + UNASSIGNED, PROHIBITED_OUTPUT + ).freeze + + # Bidirectional Characters [StringPrep, §6] + BIDI_FAILURE = /(?mx-i: # RandALCat followed by LCat + (?[\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}]) + .*? 
+ (?[\u{00aa 00b5 00ba 02ee 037a 0386 038c 0589 0903 0950 09b2 09d7 0a5e 0a83 0a8d 0ac9 0ad0 0ae0 0b40 0b57 0b83 0b9c 0bd7 0cbe 0cde 0d57 0dbd 0e84 0e8a 0e8d 0ea5 0ea7 0ebd 0ec6 0f36 0f38 0f7f 0f85 0fcf 102c 1031 1038 10fb 1248 1258 1288 12b0 12c0 1310 17dc 1f59 1f5b 1f5d 1fbe 200e 2071 207f 2102 2107 2115 2124 2126 2128 2395 1d4a2 1d4bb 1d546}\u{0041}-\u{005a}\u{0061}-\u{007a}\u{00c0}-\u{00d6}\u{00d8}-\u{00f6}\u{00f8}-\u{0220}\u{0222}-\u{0233}\u{0250}-\u{02ad}\u{02b0}-\u{02b8}\u{02bb}-\u{02c1}\u{02d0}-\u{02d1}\u{02e0}-\u{02e4}\u{0388}-\u{038a}\u{038e}-\u{03a1}\u{03a3}-\u{03ce}\u{03d0}-\u{03f5}\u{0400}-\u{0482}\u{048a}-\u{04ce}\u{04d0}-\u{04f5}\u{04f8}-\u{04f9}\u{0500}-\u{050f}\u{0531}-\u{0556}\u{0559}-\u{055f}\u{0561}-\u{0587}\u{0905}-\u{0939}\u{093d}-\u{0940}\u{0949}-\u{094c}\u{0958}-\u{0961}\u{0964}-\u{0970}\u{0982}-\u{0983}\u{0985}-\u{098c}\u{098f}-\u{0990}\u{0993}-\u{09a8}\u{09aa}-\u{09b0}\u{09b6}-\u{09b9}\u{09be}-\u{09c0}\u{09c7}-\u{09c8}\u{09cb}-\u{09cc}\u{09dc}-\u{09dd}\u{09df}-\u{09e1}\u{09e6}-\u{09f1}\u{09f4}-\u{09fa}\u{0a05}-\u{0a0a}\u{0a0f}-\u{0a10}\u{0a13}-\u{0a28}\u{0a2a}-\u{0a30}\u{0a32}-\u{0a33}\u{0a35}-\u{0a36}\u{0a38}-\u{0a39}\u{0a3e}-\u{0a40}\u{0a59}-\u{0a5c}\u{0a66}-\u{0a6f}\u{0a72}-\u{0a74}\u{0a85}-\u{0a8b}\u{0a8f}-\u{0a91}\u{0a93}-\u{0aa8}\u{0aaa}-\u{0ab0}\u{0ab2}-\u{0ab3}\u{0ab5}-\u{0ab9}\u{0abd}-\u{0ac0}\u{0acb}-\u{0acc}\u{0ae6}-\u{0aef}\u{0b02}-\u{0b03}\u{0b05}-\u{0b0c}\u{0b0f}-\u{0b10}\u{0b13}-\u{0b28}\u{0b2a}-\u{0b30}\u{0b32}-\u{0b33}\u{0b36}-\u{0b39}\u{0b3d}-\u{0b3e}\u{0b47}-\u{0b48}\u{0b4b}-\u{0b4c}\u{0b5c}-\u{0b5d}\u{0b5f}-\u{0b61}\u{0b66}-\u{0b70}\u{0b85}-\u{0b8a}\u{0b8e}-\u{0b90}\u{0b92}-\u{0b95}\u{0b99}-\u{0b9a}\u{0b9e}-\u{0b9f}\u{0ba3}-\u{0ba4}\u{0ba8}-\u{0baa}\u{0bae}-\u{0bb5}\u{0bb7}-\u{0bb9}\u{0bbe}-\u{0bbf}\u{0bc1}-\u{0bc2}\u{0bc6}-\u{0bc8}\u{0bca}-\u{0bcc}\u{0be7}-\u{0bf2}\u{0c01}-\u{0c03}\u{0c05}-\u{0c0c}\u{0c0e}-\u{0c10}\u{0c12}-\u{0c28}\u{0c2a}-\u{0c33}\u{0c35}-\u{0c39}\u{0c41}-\u{0c44}\u{0c60}-\u{0c61}\u{0c66}-\u{0c6f}\u{0c
82}-\u{0c83}\u{0c85}-\u{0c8c}\u{0c8e}-\u{0c90}\u{0c92}-\u{0ca8}\u{0caa}-\u{0cb3}\u{0cb5}-\u{0cb9}\u{0cc0}-\u{0cc4}\u{0cc7}-\u{0cc8}\u{0cca}-\u{0ccb}\u{0cd5}-\u{0cd6}\u{0ce0}-\u{0ce1}\u{0ce6}-\u{0cef}\u{0d02}-\u{0d03}\u{0d05}-\u{0d0c}\u{0d0e}-\u{0d10}\u{0d12}-\u{0d28}\u{0d2a}-\u{0d39}\u{0d3e}-\u{0d40}\u{0d46}-\u{0d48}\u{0d4a}-\u{0d4c}\u{0d60}-\u{0d61}\u{0d66}-\u{0d6f}\u{0d82}-\u{0d83}\u{0d85}-\u{0d96}\u{0d9a}-\u{0db1}\u{0db3}-\u{0dbb}\u{0dc0}-\u{0dc6}\u{0dcf}-\u{0dd1}\u{0dd8}-\u{0ddf}\u{0df2}-\u{0df4}\u{0e01}-\u{0e30}\u{0e32}-\u{0e33}\u{0e40}-\u{0e46}\u{0e4f}-\u{0e5b}\u{0e81}-\u{0e82}\u{0e87}-\u{0e88}\u{0e94}-\u{0e97}\u{0e99}-\u{0e9f}\u{0ea1}-\u{0ea3}\u{0eaa}-\u{0eab}\u{0ead}-\u{0eb0}\u{0eb2}-\u{0eb3}\u{0ec0}-\u{0ec4}\u{0ed0}-\u{0ed9}\u{0edc}-\u{0edd}\u{0f00}-\u{0f17}\u{0f1a}-\u{0f34}\u{0f3e}-\u{0f47}\u{0f49}-\u{0f6a}\u{0f88}-\u{0f8b}\u{0fbe}-\u{0fc5}\u{0fc7}-\u{0fcc}\u{1000}-\u{1021}\u{1023}-\u{1027}\u{1029}-\u{102a}\u{1040}-\u{1057}\u{10a0}-\u{10c5}\u{10d0}-\u{10f8}\u{1100}-\u{1159}\u{115f}-\u{11a2}\u{11a8}-\u{11f9}\u{1200}-\u{1206}\u{1208}-\u{1246}\u{124a}-\u{124d}\u{1250}-\u{1256}\u{125a}-\u{125d}\u{1260}-\u{1286}\u{128a}-\u{128d}\u{1290}-\u{12ae}\u{12b2}-\u{12b5}\u{12b8}-\u{12be}\u{12c2}-\u{12c5}\u{12c8}-\u{12ce}\u{12d0}-\u{12d6}\u{12d8}-\u{12ee}\u{12f0}-\u{130e}\u{1312}-\u{1315}\u{1318}-\u{131e}\u{1320}-\u{1346}\u{1348}-\u{135a}\u{1361}-\u{137c}\u{13a0}-\u{13f4}\u{1401}-\u{1676}\u{1681}-\u{169a}\u{16a0}-\u{16f0}\u{1700}-\u{170c}\u{170e}-\u{1711}\u{1720}-\u{1731}\u{1735}-\u{1736}\u{1740}-\u{1751}\u{1760}-\u{176c}\u{176e}-\u{1770}\u{1780}-\u{17b6}\u{17be}-\u{17c5}\u{17c7}-\u{17c8}\u{17d4}-\u{17da}\u{17e0}-\u{17e9}\u{1810}-\u{1819}\u{1820}-\u{1877}\u{1880}-\u{18a8}\u{1e00}-\u{1e9b}\u{1ea0}-\u{1ef9}\u{1f00}-\u{1f15}\u{1f18}-\u{1f1d}\u{1f20}-\u{1f45}\u{1f48}-\u{1f4d}\u{1f50}-\u{1f57}\u{1f5f}-\u{1f7d}\u{1f80}-\u{1fb4}\u{1fb6}-\u{1fbc}\u{1fc2}-\u{1fc4}\u{1fc6}-\u{1fcc}\u{1fd0}-\u{1fd3}\u{1fd6}-\u{1fdb}\u{1fe0}-\u{1fec}\u{1ff2}-\u{1ff4}\u{1ff6}-\u{1ffc}\u{210a}-\u{2113
}\u{2119}-\u{211d}\u{212a}-\u{212d}\u{212f}-\u{2131}\u{2133}-\u{2139}\u{213d}-\u{213f}\u{2145}-\u{2149}\u{2160}-\u{2183}\u{2336}-\u{237a}\u{249c}-\u{24e9}\u{3005}-\u{3007}\u{3021}-\u{3029}\u{3031}-\u{3035}\u{3038}-\u{303c}\u{3041}-\u{3096}\u{309d}-\u{309f}\u{30a1}-\u{30fa}\u{30fc}-\u{30ff}\u{3105}-\u{312c}\u{3131}-\u{318e}\u{3190}-\u{31b7}\u{31f0}-\u{321c}\u{3220}-\u{3243}\u{3260}-\u{327b}\u{327f}-\u{32b0}\u{32c0}-\u{32cb}\u{32d0}-\u{32fe}\u{3300}-\u{3376}\u{337b}-\u{33dd}\u{33e0}-\u{33fe}\u{3400}-\u{4db5}\u{4e00}-\u{9fa5}\u{a000}-\u{a48c}\u{ac00}-\u{d7a3}\u{e000}-\u{fa2d}\u{fa30}-\u{fa6a}\u{fb00}-\u{fb06}\u{fb13}-\u{fb17}\u{ff21}-\u{ff3a}\u{ff41}-\u{ff5a}\u{ff66}-\u{ffbe}\u{ffc2}-\u{ffc7}\u{ffca}-\u{ffcf}\u{ffd2}-\u{ffd7}\u{ffda}-\u{ffdc}\u{10300}-\u{1031e}\u{10320}-\u{10323}\u{10330}-\u{1034a}\u{10400}-\u{10425}\u{10428}-\u{1044d}\u{1d000}-\u{1d0f5}\u{1d100}-\u{1d126}\u{1d12a}-\u{1d166}\u{1d16a}-\u{1d172}\u{1d183}-\u{1d184}\u{1d18c}-\u{1d1a9}\u{1d1ae}-\u{1d1dd}\u{1d400}-\u{1d454}\u{1d456}-\u{1d49c}\u{1d49e}-\u{1d49f}\u{1d4a5}-\u{1d4a6}\u{1d4a9}-\u{1d4ac}\u{1d4ae}-\u{1d4b9}\u{1d4bd}-\u{1d4c0}\u{1d4c2}-\u{1d4c3}\u{1d4c5}-\u{1d505}\u{1d507}-\u{1d50a}\u{1d50d}-\u{1d514}\u{1d516}-\u{1d51c}\u{1d51e}-\u{1d539}\u{1d53b}-\u{1d53e}\u{1d540}-\u{1d544}\u{1d54a}-\u{1d550}\u{1d552}-\u{1d6a3}\u{1d6a8}-\u{1d7c9}\u{20000}-\u{2a6d6}\u{2f800}-\u{2fa1d}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}\p{Cs}]) + | # RandALCat preceded by LCat + \g .*? \g + ) | (?mx-i: # contains RandALCat but doesn't start with RandALCat + \A(?(?-mix:[^\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}])) + .*? 
+        g
+      | # contains RandALCat but doesn't end with RandALCat
+        \g .*? \g\z
+      )/mx.freeze
+
+    # Matches strings prohibited by RFC4013 §2.3 and §2.4.
+    #
+    # This checks prohibited output and bidirectional characters.
+    PROHIBITED = Regexp.union(
+      PROHIBITED_OUTPUT, BIDI_FAILURE,
+    )
+
+    # Matches strings prohibited by RFC4013 §2.3, §2.4, and §2.5.
+    #
+    # This checks prohibited output, bidirectional characters, and
+    # unassigned codepoints.
+    PROHIBITED_STORED = Regexp.union(
+      PROHIBITED_OUTPUT_STORED, BIDI_FAILURE,
+    )
+
+  end
+end
diff --git a/lib/net/imap/sasl/stringprep.rb b/lib/net/imap/sasl/stringprep.rb
new file mode 100644
index 00000000..32d25557
--- /dev/null
+++ b/lib/net/imap/sasl/stringprep.rb
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+
+require_relative "stringprep_tables"
+
+module Net::IMAP::SASL
+
+  # Regexps and utility methods for implementing stringprep profiles. The
+  # \StringPrep algorithm is defined by
+  # {RFC-3454}[https://www.rfc-editor.org/rfc/rfc3454.html]. Each
+  # codepoint table defined in the RFC-3454 appendices is matched by a Regexp
+  # defined in this module.
+  #--
+  # TODO: generic StringPrep mapping (not needed for SASLprep implementation)
+  #++
+  module StringPrep
+
+    # Returns a Regexp matching the given +table+ name.
+    def self.[](table)
+      TABLE_REGEXPS.fetch(table)
+    end
+
+    module_function
+
+    # Checks +string+ for any codepoint in +tables+ (default: all "C" tables).
+    # Raises a ProhibitedCodepoint (recording +string+ and +profile+) that
+    # describes the first matching table.
+    # Also checks bidirectional characters, when bidi: true, which may
+    # raise a BidiStringError.
+    def check_prohibited!(string, *tables, bidi: false, profile: nil)
+      tables = TABLE_TITLES.keys.grep(/^C/) if tables.empty?
+      tables |= %w[C.8] if bidi
+      table = tables.find {|t| TABLE_REGEXPS[t].match?(string) }
+      raise ProhibitedCodepoint.new(
+        table, string: string, profile: profile
+      ) if table
+      check_bidi!(string, profile: profile) if bidi
+    end
+
+    # Checks that +string+ obeys all of the "Bidirectional Characters"
+    # requirements in RFC-3454, §6:
+    #
+    # * The characters in \StringPrep\[\"C.8\"] MUST be prohibited
+    # * If a string contains any RandALCat character, the string MUST NOT
+    #   contain any LCat character.
+    # * If a string contains any RandALCat character, a RandALCat
+    #   character MUST be the first character of the string, and a
+    #   RandALCat character MUST be the last character of the string.
+    #
+    # This is usually combined with #check_prohibited!, so table "C.8" is only
+    # checked when +c_8: true+.
+    #
+    # Raises either ProhibitedCodepoint or BidiStringError unless all
+    # requirements are met.
+    def check_bidi!(string, c_8: false, profile: nil)
+      check_prohibited!(string, "C.8", profile: profile) if c_8
+      if BIDI_FAILS_REQ2.match?(string)
+        raise BidiStringError.new(
+          BIDI_DESC_REQ2, string: string, profile: profile,
+        )
+      elsif BIDI_FAILS_REQ3.match?(string)
+        raise BidiStringError.new(
+          BIDI_DESC_REQ3, string: string, profile: profile,
+        )
+      end
+    end
+
+  end
+end
diff --git a/lib/net/imap/sasl/stringprep_tables.rb b/lib/net/imap/sasl/stringprep_tables.rb
new file mode 100644
index 00000000..ffb56028
--- /dev/null
+++ b/lib/net/imap/sasl/stringprep_tables.rb
@@ -0,0 +1,153 @@
+# frozen_string_literal: true
+
+#--
+# This file is generated from RFC3454, by rake. Don't edit directly.
+#++ + +module Net::IMAP::SASL + + module StringPrep + + # Unassigned code points in Unicode 3.2 \StringPrep\[\"A.1\"] + IN_A_1 = /\p{^AGE=3.2}/.freeze + + # Commonly mapped to nothing \StringPrep\[\"B.1\"] + IN_B_1 = /[\u{00ad 034f 1806 2060 feff}\u{180b}-\u{180d}\u{200b}-\u{200d}\u{fe00}-\u{fe0f}]/.freeze + + # Mapping for case-folding used with NFKC \StringPrep\[\"B.2\"] + IN_B_2 = /[\u{00b5 0100 0102 0104 0106 0108 010a 010c 010e 0110 0112 0114 0116 0118 011a 011c 011e 0120 0122 0124 0126 0128 012a 012c 012e 0130 0132 0134 0136 0139 013b 013d 013f 0141 0143 0145 0147 014c 014e 0150 0152 0154 0156 0158 015a 015c 015e 0160 0162 0164 0166 0168 016a 016c 016e 0170 0172 0174 0176 017b 017d 017f 0184 01a2 01a4 01a9 01ac 01b5 01bc 01cd 01cf 01d1 01d3 01d5 01d7 01d9 01db 01de 01e0 01e2 01e4 01e6 01e8 01ea 01ec 01ee 01f4 01fa 01fc 01fe 0200 0202 0204 0206 0208 020a 020c 020e 0210 0212 0214 0216 0218 021a 021c 021e 0220 0222 0224 0226 0228 022a 022c 022e 0230 0232 0345 037a 0386 038c 03b0 03c2 03d8 03da 03dc 03de 03e0 03e2 03e4 03e6 03e8 03ea 03ec 03ee 0460 0462 0464 0466 0468 046a 046c 046e 0470 0472 0474 0476 0478 047a 047c 047e 0480 048a 048c 048e 0490 0492 0494 0496 0498 049a 049c 049e 04a0 04a2 04a4 04a6 04a8 04aa 04ac 04ae 04b0 04b2 04b4 04b6 04b8 04ba 04bc 04be 04c1 04c3 04c5 04c7 04c9 04cb 04cd 04d0 04d2 04d4 04d6 04d8 04da 04dc 04de 04e0 04e2 04e4 04e6 04e8 04ea 04ec 04ee 04f0 04f2 04f4 04f8 0500 0502 0504 0506 0508 050a 050c 050e 0587 1e00 1e02 1e04 1e06 1e08 1e0a 1e0c 1e0e 1e10 1e12 1e14 1e16 1e18 1e1a 1e1c 1e1e 1e20 1e22 1e24 1e26 1e28 1e2a 1e2c 1e2e 1e30 1e32 1e34 1e36 1e38 1e3a 1e3c 1e3e 1e40 1e42 1e44 1e46 1e48 1e4a 1e4c 1e4e 1e50 1e52 1e54 1e56 1e58 1e5a 1e5c 1e5e 1e60 1e62 1e64 1e66 1e68 1e6a 1e6c 1e6e 1e70 1e72 1e74 1e76 1e78 1e7a 1e7c 1e7e 1e80 1e82 1e84 1e86 1e88 1e8a 1e8c 1e8e 1e90 1e92 1e94 1ea0 1ea2 1ea4 1ea6 1ea8 1eaa 1eac 1eae 1eb0 1eb2 1eb4 1eb6 1eb8 1eba 1ebc 1ebe 1ec0 1ec2 1ec4 1ec6 1ec8 1eca 1ecc 1ece 1ed0 1ed2 1ed4 1ed6 1ed8 1eda 1edc 1ede 
1ee0 1ee2 1ee4 1ee6 1ee8 1eea 1eec 1eee 1ef0 1ef2 1ef4 1ef6 1ef8 1f50 1f52 1f54 1f56 1f59 1f5b 1f5d 1f5f 1fbe 20a8 2107 2109 2124 2126 2128 2133 2145 3371 3373 3375 33c3 33cb 33d7 1d49c 1d4a2 1d546 1d6d3 1d70d 1d747 1d781 1d7bb}\u{0041}-\u{005a}\u{00c0}-\u{00d6}\u{00d8}-\u{00df}\u{0149}-\u{014a}\u{0178}-\u{0179}\u{0181}-\u{0182}\u{0186}-\u{0187}\u{0189}-\u{018b}\u{018e}-\u{0191}\u{0193}-\u{0194}\u{0196}-\u{0198}\u{019c}-\u{019d}\u{019f}-\u{01a0}\u{01a6}-\u{01a7}\u{01ae}-\u{01af}\u{01b1}-\u{01b3}\u{01b7}-\u{01b8}\u{01c4}-\u{01c5}\u{01c7}-\u{01c8}\u{01ca}-\u{01cb}\u{01f0}-\u{01f2}\u{01f6}-\u{01f8}\u{0388}-\u{038a}\u{038e}-\u{03a1}\u{03a3}-\u{03ab}\u{03d0}-\u{03d6}\u{03f0}-\u{03f2}\u{03f4}-\u{03f5}\u{0400}-\u{042f}\u{0531}-\u{0556}\u{1e96}-\u{1e9b}\u{1f08}-\u{1f0f}\u{1f18}-\u{1f1d}\u{1f28}-\u{1f2f}\u{1f38}-\u{1f3f}\u{1f48}-\u{1f4d}\u{1f68}-\u{1f6f}\u{1f80}-\u{1faf}\u{1fb2}-\u{1fb4}\u{1fb6}-\u{1fbc}\u{1fc2}-\u{1fc4}\u{1fc6}-\u{1fcc}\u{1fd2}-\u{1fd3}\u{1fd6}-\u{1fdb}\u{1fe2}-\u{1fe4}\u{1fe6}-\u{1fec}\u{1ff2}-\u{1ff4}\u{1ff6}-\u{1ffc}\u{2102}-\u{2103}\u{210b}-\u{210d}\u{2110}-\u{2112}\u{2115}-\u{2116}\u{2119}-\u{211d}\u{2120}-\u{2122}\u{212a}-\u{212d}\u{2130}-\u{2131}\u{213e}-\u{213f}\u{2160}-\u{216f}\u{24b6}-\u{24cf}\u{3380}-\u{3387}\u{338a}-\u{338c}\u{3390}-\u{3394}\u{33a9}-\u{33ac}\u{33b4}-\u{33c1}\u{33c6}-\u{33c9}\u{33cd}-\u{33ce}\u{33d9}-\u{33da}\u{33dc}-\u{33dd}\u{fb00}-\u{fb06}\u{fb13}-\u{fb17}\u{ff21}-\u{ff3a}\u{10400}-\u{10425}\u{1d400}-\u{1d419}\u{1d434}-\u{1d44d}\u{1d468}-\u{1d481}\u{1d49e}-\u{1d49f}\u{1d4a5}-\u{1d4a6}\u{1d4a9}-\u{1d4ac}\u{1d4ae}-\u{1d4b5}\u{1d4d0}-\u{1d4e9}\u{1d504}-\u{1d505}\u{1d507}-\u{1d50a}\u{1d50d}-\u{1d514}\u{1d516}-\u{1d51c}\u{1d538}-\u{1d539}\u{1d53b}-\u{1d53e}\u{1d540}-\u{1d544}\u{1d54a}-\u{1d550}\u{1d56c}-\u{1d585}\u{1d5a0}-\u{1d5b9}\u{1d5d4}-\u{1d5ed}\u{1d608}-\u{1d621}\u{1d63c}-\u{1d655}\u{1d670}-\u{1d689}\u{1d6a8}-\u{1d6c0}\u{1d6e2}-\u{1d6fa}\u{1d71c}-\u{1d734}\u{1d756}-\u{1d76e}\u{1d790}-\u{1d7a8}]/.freeze + + # Mapping for 
case-folding used with no normalization \StringPrep\[\"B.3\"] + IN_B_3 = /[\u{00b5 0100 0102 0104 0106 0108 010a 010c 010e 0110 0112 0114 0116 0118 011a 011c 011e 0120 0122 0124 0126 0128 012a 012c 012e 0130 0132 0134 0136 0139 013b 013d 013f 0141 0143 0145 0147 014c 014e 0150 0152 0154 0156 0158 015a 015c 015e 0160 0162 0164 0166 0168 016a 016c 016e 0170 0172 0174 0176 017b 017d 017f 0184 01a2 01a4 01a9 01ac 01b5 01bc 01cd 01cf 01d1 01d3 01d5 01d7 01d9 01db 01de 01e0 01e2 01e4 01e6 01e8 01ea 01ec 01ee 01f4 01fa 01fc 01fe 0200 0202 0204 0206 0208 020a 020c 020e 0210 0212 0214 0216 0218 021a 021c 021e 0220 0222 0224 0226 0228 022a 022c 022e 0230 0232 0345 0386 038c 03b0 03c2 03d8 03da 03dc 03de 03e0 03e2 03e4 03e6 03e8 03ea 03ec 03ee 0460 0462 0464 0466 0468 046a 046c 046e 0470 0472 0474 0476 0478 047a 047c 047e 0480 048a 048c 048e 0490 0492 0494 0496 0498 049a 049c 049e 04a0 04a2 04a4 04a6 04a8 04aa 04ac 04ae 04b0 04b2 04b4 04b6 04b8 04ba 04bc 04be 04c1 04c3 04c5 04c7 04c9 04cb 04cd 04d0 04d2 04d4 04d6 04d8 04da 04dc 04de 04e0 04e2 04e4 04e6 04e8 04ea 04ec 04ee 04f0 04f2 04f4 04f8 0500 0502 0504 0506 0508 050a 050c 050e 0587 1e00 1e02 1e04 1e06 1e08 1e0a 1e0c 1e0e 1e10 1e12 1e14 1e16 1e18 1e1a 1e1c 1e1e 1e20 1e22 1e24 1e26 1e28 1e2a 1e2c 1e2e 1e30 1e32 1e34 1e36 1e38 1e3a 1e3c 1e3e 1e40 1e42 1e44 1e46 1e48 1e4a 1e4c 1e4e 1e50 1e52 1e54 1e56 1e58 1e5a 1e5c 1e5e 1e60 1e62 1e64 1e66 1e68 1e6a 1e6c 1e6e 1e70 1e72 1e74 1e76 1e78 1e7a 1e7c 1e7e 1e80 1e82 1e84 1e86 1e88 1e8a 1e8c 1e8e 1e90 1e92 1e94 1ea0 1ea2 1ea4 1ea6 1ea8 1eaa 1eac 1eae 1eb0 1eb2 1eb4 1eb6 1eb8 1eba 1ebc 1ebe 1ec0 1ec2 1ec4 1ec6 1ec8 1eca 1ecc 1ece 1ed0 1ed2 1ed4 1ed6 1ed8 1eda 1edc 1ede 1ee0 1ee2 1ee4 1ee6 1ee8 1eea 1eec 1eee 1ef0 1ef2 1ef4 1ef6 1ef8 1f50 1f52 1f54 1f56 1f59 1f5b 1f5d 1f5f 1fbe 
2126}\u{0041}-\u{005a}\u{00c0}-\u{00d6}\u{00d8}-\u{00df}\u{0149}-\u{014a}\u{0178}-\u{0179}\u{0181}-\u{0182}\u{0186}-\u{0187}\u{0189}-\u{018b}\u{018e}-\u{0191}\u{0193}-\u{0194}\u{0196}-\u{0198}\u{019c}-\u{019d}\u{019f}-\u{01a0}\u{01a6}-\u{01a7}\u{01ae}-\u{01af}\u{01b1}-\u{01b3}\u{01b7}-\u{01b8}\u{01c4}-\u{01c5}\u{01c7}-\u{01c8}\u{01ca}-\u{01cb}\u{01f0}-\u{01f2}\u{01f6}-\u{01f8}\u{0388}-\u{038a}\u{038e}-\u{03a1}\u{03a3}-\u{03ab}\u{03d0}-\u{03d1}\u{03d5}-\u{03d6}\u{03f0}-\u{03f2}\u{03f4}-\u{03f5}\u{0400}-\u{042f}\u{0531}-\u{0556}\u{1e96}-\u{1e9b}\u{1f08}-\u{1f0f}\u{1f18}-\u{1f1d}\u{1f28}-\u{1f2f}\u{1f38}-\u{1f3f}\u{1f48}-\u{1f4d}\u{1f68}-\u{1f6f}\u{1f80}-\u{1faf}\u{1fb2}-\u{1fb4}\u{1fb6}-\u{1fbc}\u{1fc2}-\u{1fc4}\u{1fc6}-\u{1fcc}\u{1fd2}-\u{1fd3}\u{1fd6}-\u{1fdb}\u{1fe2}-\u{1fe4}\u{1fe6}-\u{1fec}\u{1ff2}-\u{1ff4}\u{1ff6}-\u{1ffc}\u{212a}-\u{212b}\u{2160}-\u{216f}\u{24b6}-\u{24cf}\u{fb00}-\u{fb06}\u{fb13}-\u{fb17}\u{ff21}-\u{ff3a}\u{10400}-\u{10425}]/.freeze + + # ASCII space characters \StringPrep\[\"C.1.1\"] + IN_C_1_1 = / /.freeze + + # Non-ASCII space characters \StringPrep\[\"C.1.2\"] + IN_C_1_2 = /[\u200b\p{Zs}&&[^ ]]/.freeze + + # ASCII control characters \StringPrep\[\"C.2.1\"] + IN_C_2_1 = /[\x00-\x1f\x7f]/.freeze + + # Non-ASCII control characters \StringPrep\[\"C.2.2\"] + IN_C_2_2 = /[\u{06dd 070f 180e feff}\u{0080}-\u{009f}\u{200c}-\u{200d}\u{2028}-\u{2029}\u{2060}-\u{2063}\u{206a}-\u{206f}\u{fff9}-\u{fffc}\u{1d173}-\u{1d17a}]/.freeze + + # Private use \StringPrep\[\"C.3\"] + IN_C_3 = /\p{private use}/.freeze + + # Non-character code points \StringPrep\[\"C.4\"] + IN_C_4 = /\p{noncharacter code point}/.freeze + + # Surrogate codes \StringPrep\[\"C.5\"] + IN_C_5 = /\p{surrogate}/.freeze + + # Inappropriate for plain text \StringPrep\[\"C.6\"] + IN_C_6 = /[\p{in specials}&&\p{AGE=3.2}&&\p{^NChar}]/.freeze + + # Inappropriate for canonical representation \StringPrep\[\"C.7\"] + IN_C_7 = /[\p{in ideographic description characters}&&\p{AGE=3.2}]/.freeze + + # 
Change display properties or are deprecated \StringPrep\[\"C.8\"] + IN_C_8 = /[\u{0340}-\u{0341}\u{200e}-\u{200f}\u{202a}-\u{202e}\u{206a}-\u{206f}]/.freeze + + # Tagging characters \StringPrep\[\"C.9\"] + IN_C_9 = /[\p{in Tags}&&\p{AGE=3.2}]/.freeze + + # Characters with bidirectional property "R" or "AL" \StringPrep\[\"D.1\"] + IN_D_1 = /[\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}]/.freeze + + # Used to check req3 of bidirectional checks + # Matches the negation of the D.1 table + IN_D_1_NEGATED = /[^\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}]/.freeze + + # Characters with bidirectional property "L" \StringPrep\[\"D.2\"] + IN_D_2 = /[\u{00aa 00b5 00ba 02ee 037a 0386 038c 0589 0903 0950 09b2 09d7 0a5e 0a83 0a8d 0ac9 0ad0 0ae0 0b40 0b57 0b83 0b9c 0bd7 0cbe 0cde 0d57 0dbd 0e84 0e8a 0e8d 0ea5 0ea7 0ebd 0ec6 0f36 0f38 0f7f 0f85 0fcf 102c 1031 1038 10fb 1248 1258 1288 12b0 12c0 1310 17dc 1f59 1f5b 1f5d 1fbe 200e 2071 207f 2102 2107 2115 2124 2126 2128 2395 1d4a2 1d4bb 
1d546}\u{0041}-\u{005a}\u{0061}-\u{007a}\u{00c0}-\u{00d6}\u{00d8}-\u{00f6}\u{00f8}-\u{0220}\u{0222}-\u{0233}\u{0250}-\u{02ad}\u{02b0}-\u{02b8}\u{02bb}-\u{02c1}\u{02d0}-\u{02d1}\u{02e0}-\u{02e4}\u{0388}-\u{038a}\u{038e}-\u{03a1}\u{03a3}-\u{03ce}\u{03d0}-\u{03f5}\u{0400}-\u{0482}\u{048a}-\u{04ce}\u{04d0}-\u{04f5}\u{04f8}-\u{04f9}\u{0500}-\u{050f}\u{0531}-\u{0556}\u{0559}-\u{055f}\u{0561}-\u{0587}\u{0905}-\u{0939}\u{093d}-\u{0940}\u{0949}-\u{094c}\u{0958}-\u{0961}\u{0964}-\u{0970}\u{0982}-\u{0983}\u{0985}-\u{098c}\u{098f}-\u{0990}\u{0993}-\u{09a8}\u{09aa}-\u{09b0}\u{09b6}-\u{09b9}\u{09be}-\u{09c0}\u{09c7}-\u{09c8}\u{09cb}-\u{09cc}\u{09dc}-\u{09dd}\u{09df}-\u{09e1}\u{09e6}-\u{09f1}\u{09f4}-\u{09fa}\u{0a05}-\u{0a0a}\u{0a0f}-\u{0a10}\u{0a13}-\u{0a28}\u{0a2a}-\u{0a30}\u{0a32}-\u{0a33}\u{0a35}-\u{0a36}\u{0a38}-\u{0a39}\u{0a3e}-\u{0a40}\u{0a59}-\u{0a5c}\u{0a66}-\u{0a6f}\u{0a72}-\u{0a74}\u{0a85}-\u{0a8b}\u{0a8f}-\u{0a91}\u{0a93}-\u{0aa8}\u{0aaa}-\u{0ab0}\u{0ab2}-\u{0ab3}\u{0ab5}-\u{0ab9}\u{0abd}-\u{0ac0}\u{0acb}-\u{0acc}\u{0ae6}-\u{0aef}\u{0b02}-\u{0b03}\u{0b05}-\u{0b0c}\u{0b0f}-\u{0b10}\u{0b13}-\u{0b28}\u{0b2a}-\u{0b30}\u{0b32}-\u{0b33}\u{0b36}-\u{0b39}\u{0b3d}-\u{0b3e}\u{0b47}-\u{0b48}\u{0b4b}-\u{0b4c}\u{0b5c}-\u{0b5d}\u{0b5f}-\u{0b61}\u{0b66}-\u{0b70}\u{0b85}-\u{0b8a}\u{0b8e}-\u{0b90}\u{0b92}-\u{0b95}\u{0b99}-\u{0b9a}\u{0b9e}-\u{0b9f}\u{0ba3}-\u{0ba4}\u{0ba8}-\u{0baa}\u{0bae}-\u{0bb5}\u{0bb7}-\u{0bb9}\u{0bbe}-\u{0bbf}\u{0bc1}-\u{0bc2}\u{0bc6}-\u{0bc8}\u{0bca}-\u{0bcc}\u{0be7}-\u{0bf2}\u{0c01}-\u{0c03}\u{0c05}-\u{0c0c}\u{0c0e}-\u{0c10}\u{0c12}-\u{0c28}\u{0c2a}-\u{0c33}\u{0c35}-\u{0c39}\u{0c41}-\u{0c44}\u{0c60}-\u{0c61}\u{0c66}-\u{0c6f}\u{0c82}-\u{0c83}\u{0c85}-\u{0c8c}\u{0c8e}-\u{0c90}\u{0c92}-\u{0ca8}\u{0caa}-\u{0cb3}\u{0cb5}-\u{0cb9}\u{0cc0}-\u{0cc4}\u{0cc7}-\u{0cc8}\u{0cca}-\u{0ccb}\u{0cd5}-\u{0cd6}\u{0ce0}-\u{0ce1}\u{0ce6}-\u{0cef}\u{0d02}-\u{0d03}\u{0d05}-\u{0d0c}\u{0d0e}-\u{0d10}\u{0d12}-\u{0d28}\u{0d2a}-\u{0d39}\u{0d3e}-\u{0d40}\u{0d46}-\u{0d48}\u{0d4a}-\u{0d4c}\u{0d
60}-\u{0d61}\u{0d66}-\u{0d6f}\u{0d82}-\u{0d83}\u{0d85}-\u{0d96}\u{0d9a}-\u{0db1}\u{0db3}-\u{0dbb}\u{0dc0}-\u{0dc6}\u{0dcf}-\u{0dd1}\u{0dd8}-\u{0ddf}\u{0df2}-\u{0df4}\u{0e01}-\u{0e30}\u{0e32}-\u{0e33}\u{0e40}-\u{0e46}\u{0e4f}-\u{0e5b}\u{0e81}-\u{0e82}\u{0e87}-\u{0e88}\u{0e94}-\u{0e97}\u{0e99}-\u{0e9f}\u{0ea1}-\u{0ea3}\u{0eaa}-\u{0eab}\u{0ead}-\u{0eb0}\u{0eb2}-\u{0eb3}\u{0ec0}-\u{0ec4}\u{0ed0}-\u{0ed9}\u{0edc}-\u{0edd}\u{0f00}-\u{0f17}\u{0f1a}-\u{0f34}\u{0f3e}-\u{0f47}\u{0f49}-\u{0f6a}\u{0f88}-\u{0f8b}\u{0fbe}-\u{0fc5}\u{0fc7}-\u{0fcc}\u{1000}-\u{1021}\u{1023}-\u{1027}\u{1029}-\u{102a}\u{1040}-\u{1057}\u{10a0}-\u{10c5}\u{10d0}-\u{10f8}\u{1100}-\u{1159}\u{115f}-\u{11a2}\u{11a8}-\u{11f9}\u{1200}-\u{1206}\u{1208}-\u{1246}\u{124a}-\u{124d}\u{1250}-\u{1256}\u{125a}-\u{125d}\u{1260}-\u{1286}\u{128a}-\u{128d}\u{1290}-\u{12ae}\u{12b2}-\u{12b5}\u{12b8}-\u{12be}\u{12c2}-\u{12c5}\u{12c8}-\u{12ce}\u{12d0}-\u{12d6}\u{12d8}-\u{12ee}\u{12f0}-\u{130e}\u{1312}-\u{1315}\u{1318}-\u{131e}\u{1320}-\u{1346}\u{1348}-\u{135a}\u{1361}-\u{137c}\u{13a0}-\u{13f4}\u{1401}-\u{1676}\u{1681}-\u{169a}\u{16a0}-\u{16f0}\u{1700}-\u{170c}\u{170e}-\u{1711}\u{1720}-\u{1731}\u{1735}-\u{1736}\u{1740}-\u{1751}\u{1760}-\u{176c}\u{176e}-\u{1770}\u{1780}-\u{17b6}\u{17be}-\u{17c5}\u{17c7}-\u{17c8}\u{17d4}-\u{17da}\u{17e0}-\u{17e9}\u{1810}-\u{1819}\u{1820}-\u{1877}\u{1880}-\u{18a8}\u{1e00}-\u{1e9b}\u{1ea0}-\u{1ef9}\u{1f00}-\u{1f15}\u{1f18}-\u{1f1d}\u{1f20}-\u{1f45}\u{1f48}-\u{1f4d}\u{1f50}-\u{1f57}\u{1f5f}-\u{1f7d}\u{1f80}-\u{1fb4}\u{1fb6}-\u{1fbc}\u{1fc2}-\u{1fc4}\u{1fc6}-\u{1fcc}\u{1fd0}-\u{1fd3}\u{1fd6}-\u{1fdb}\u{1fe0}-\u{1fec}\u{1ff2}-\u{1ff4}\u{1ff6}-\u{1ffc}\u{210a}-\u{2113}\u{2119}-\u{211d}\u{212a}-\u{212d}\u{212f}-\u{2131}\u{2133}-\u{2139}\u{213d}-\u{213f}\u{2145}-\u{2149}\u{2160}-\u{2183}\u{2336}-\u{237a}\u{249c}-\u{24e9}\u{3005}-\u{3007}\u{3021}-\u{3029}\u{3031}-\u{3035}\u{3038}-\u{303c}\u{3041}-\u{3096}\u{309d}-\u{309f}\u{30a1}-\u{30fa}\u{30fc}-\u{30ff}\u{3105}-\u{312c}\u{3131}-\u{318e}\u{3190}-\u{31b7
}\u{31f0}-\u{321c}\u{3220}-\u{3243}\u{3260}-\u{327b}\u{327f}-\u{32b0}\u{32c0}-\u{32cb}\u{32d0}-\u{32fe}\u{3300}-\u{3376}\u{337b}-\u{33dd}\u{33e0}-\u{33fe}\u{3400}-\u{4db5}\u{4e00}-\u{9fa5}\u{a000}-\u{a48c}\u{ac00}-\u{d7a3}\u{e000}-\u{fa2d}\u{fa30}-\u{fa6a}\u{fb00}-\u{fb06}\u{fb13}-\u{fb17}\u{ff21}-\u{ff3a}\u{ff41}-\u{ff5a}\u{ff66}-\u{ffbe}\u{ffc2}-\u{ffc7}\u{ffca}-\u{ffcf}\u{ffd2}-\u{ffd7}\u{ffda}-\u{ffdc}\u{10300}-\u{1031e}\u{10320}-\u{10323}\u{10330}-\u{1034a}\u{10400}-\u{10425}\u{10428}-\u{1044d}\u{1d000}-\u{1d0f5}\u{1d100}-\u{1d126}\u{1d12a}-\u{1d166}\u{1d16a}-\u{1d172}\u{1d183}-\u{1d184}\u{1d18c}-\u{1d1a9}\u{1d1ae}-\u{1d1dd}\u{1d400}-\u{1d454}\u{1d456}-\u{1d49c}\u{1d49e}-\u{1d49f}\u{1d4a5}-\u{1d4a6}\u{1d4a9}-\u{1d4ac}\u{1d4ae}-\u{1d4b9}\u{1d4bd}-\u{1d4c0}\u{1d4c2}-\u{1d4c3}\u{1d4c5}-\u{1d505}\u{1d507}-\u{1d50a}\u{1d50d}-\u{1d514}\u{1d516}-\u{1d51c}\u{1d51e}-\u{1d539}\u{1d53b}-\u{1d53e}\u{1d540}-\u{1d544}\u{1d54a}-\u{1d550}\u{1d552}-\u{1d6a3}\u{1d6a8}-\u{1d7c9}\u{20000}-\u{2a6d6}\u{2f800}-\u{2fa1d}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}\p{Cs}]/.freeze + + BIDI_DESC_REQ2 = "A string with RandALCat characters must not contain LCat characters." + + # Bidirectional Characters [StringPrep, §6], Requirement 2:: + # If a string contains any RandALCat character, the string MUST NOT + # contain any LCat character. + BIDI_FAILS_REQ2 = / # RandALCat followed by LCat + (?[\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}]) + .*? 
+ (?[\u{00aa 00b5 00ba 02ee 037a 0386 038c 0589 0903 0950 09b2 09d7 0a5e 0a83 0a8d 0ac9 0ad0 0ae0 0b40 0b57 0b83 0b9c 0bd7 0cbe 0cde 0d57 0dbd 0e84 0e8a 0e8d 0ea5 0ea7 0ebd 0ec6 0f36 0f38 0f7f 0f85 0fcf 102c 1031 1038 10fb 1248 1258 1288 12b0 12c0 1310 17dc 1f59 1f5b 1f5d 1fbe 200e 2071 207f 2102 2107 2115 2124 2126 2128 2395 1d4a2 1d4bb 1d546}\u{0041}-\u{005a}\u{0061}-\u{007a}\u{00c0}-\u{00d6}\u{00d8}-\u{00f6}\u{00f8}-\u{0220}\u{0222}-\u{0233}\u{0250}-\u{02ad}\u{02b0}-\u{02b8}\u{02bb}-\u{02c1}\u{02d0}-\u{02d1}\u{02e0}-\u{02e4}\u{0388}-\u{038a}\u{038e}-\u{03a1}\u{03a3}-\u{03ce}\u{03d0}-\u{03f5}\u{0400}-\u{0482}\u{048a}-\u{04ce}\u{04d0}-\u{04f5}\u{04f8}-\u{04f9}\u{0500}-\u{050f}\u{0531}-\u{0556}\u{0559}-\u{055f}\u{0561}-\u{0587}\u{0905}-\u{0939}\u{093d}-\u{0940}\u{0949}-\u{094c}\u{0958}-\u{0961}\u{0964}-\u{0970}\u{0982}-\u{0983}\u{0985}-\u{098c}\u{098f}-\u{0990}\u{0993}-\u{09a8}\u{09aa}-\u{09b0}\u{09b6}-\u{09b9}\u{09be}-\u{09c0}\u{09c7}-\u{09c8}\u{09cb}-\u{09cc}\u{09dc}-\u{09dd}\u{09df}-\u{09e1}\u{09e6}-\u{09f1}\u{09f4}-\u{09fa}\u{0a05}-\u{0a0a}\u{0a0f}-\u{0a10}\u{0a13}-\u{0a28}\u{0a2a}-\u{0a30}\u{0a32}-\u{0a33}\u{0a35}-\u{0a36}\u{0a38}-\u{0a39}\u{0a3e}-\u{0a40}\u{0a59}-\u{0a5c}\u{0a66}-\u{0a6f}\u{0a72}-\u{0a74}\u{0a85}-\u{0a8b}\u{0a8f}-\u{0a91}\u{0a93}-\u{0aa8}\u{0aaa}-\u{0ab0}\u{0ab2}-\u{0ab3}\u{0ab5}-\u{0ab9}\u{0abd}-\u{0ac0}\u{0acb}-\u{0acc}\u{0ae6}-\u{0aef}\u{0b02}-\u{0b03}\u{0b05}-\u{0b0c}\u{0b0f}-\u{0b10}\u{0b13}-\u{0b28}\u{0b2a}-\u{0b30}\u{0b32}-\u{0b33}\u{0b36}-\u{0b39}\u{0b3d}-\u{0b3e}\u{0b47}-\u{0b48}\u{0b4b}-\u{0b4c}\u{0b5c}-\u{0b5d}\u{0b5f}-\u{0b61}\u{0b66}-\u{0b70}\u{0b85}-\u{0b8a}\u{0b8e}-\u{0b90}\u{0b92}-\u{0b95}\u{0b99}-\u{0b9a}\u{0b9e}-\u{0b9f}\u{0ba3}-\u{0ba4}\u{0ba8}-\u{0baa}\u{0bae}-\u{0bb5}\u{0bb7}-\u{0bb9}\u{0bbe}-\u{0bbf}\u{0bc1}-\u{0bc2}\u{0bc6}-\u{0bc8}\u{0bca}-\u{0bcc}\u{0be7}-\u{0bf2}\u{0c01}-\u{0c03}\u{0c05}-\u{0c0c}\u{0c0e}-\u{0c10}\u{0c12}-\u{0c28}\u{0c2a}-\u{0c33}\u{0c35}-\u{0c39}\u{0c41}-\u{0c44}\u{0c60}-\u{0c61}\u{0c66}-\u{0c6f}\u{0c
82}-\u{0c83}\u{0c85}-\u{0c8c}\u{0c8e}-\u{0c90}\u{0c92}-\u{0ca8}\u{0caa}-\u{0cb3}\u{0cb5}-\u{0cb9}\u{0cc0}-\u{0cc4}\u{0cc7}-\u{0cc8}\u{0cca}-\u{0ccb}\u{0cd5}-\u{0cd6}\u{0ce0}-\u{0ce1}\u{0ce6}-\u{0cef}\u{0d02}-\u{0d03}\u{0d05}-\u{0d0c}\u{0d0e}-\u{0d10}\u{0d12}-\u{0d28}\u{0d2a}-\u{0d39}\u{0d3e}-\u{0d40}\u{0d46}-\u{0d48}\u{0d4a}-\u{0d4c}\u{0d60}-\u{0d61}\u{0d66}-\u{0d6f}\u{0d82}-\u{0d83}\u{0d85}-\u{0d96}\u{0d9a}-\u{0db1}\u{0db3}-\u{0dbb}\u{0dc0}-\u{0dc6}\u{0dcf}-\u{0dd1}\u{0dd8}-\u{0ddf}\u{0df2}-\u{0df4}\u{0e01}-\u{0e30}\u{0e32}-\u{0e33}\u{0e40}-\u{0e46}\u{0e4f}-\u{0e5b}\u{0e81}-\u{0e82}\u{0e87}-\u{0e88}\u{0e94}-\u{0e97}\u{0e99}-\u{0e9f}\u{0ea1}-\u{0ea3}\u{0eaa}-\u{0eab}\u{0ead}-\u{0eb0}\u{0eb2}-\u{0eb3}\u{0ec0}-\u{0ec4}\u{0ed0}-\u{0ed9}\u{0edc}-\u{0edd}\u{0f00}-\u{0f17}\u{0f1a}-\u{0f34}\u{0f3e}-\u{0f47}\u{0f49}-\u{0f6a}\u{0f88}-\u{0f8b}\u{0fbe}-\u{0fc5}\u{0fc7}-\u{0fcc}\u{1000}-\u{1021}\u{1023}-\u{1027}\u{1029}-\u{102a}\u{1040}-\u{1057}\u{10a0}-\u{10c5}\u{10d0}-\u{10f8}\u{1100}-\u{1159}\u{115f}-\u{11a2}\u{11a8}-\u{11f9}\u{1200}-\u{1206}\u{1208}-\u{1246}\u{124a}-\u{124d}\u{1250}-\u{1256}\u{125a}-\u{125d}\u{1260}-\u{1286}\u{128a}-\u{128d}\u{1290}-\u{12ae}\u{12b2}-\u{12b5}\u{12b8}-\u{12be}\u{12c2}-\u{12c5}\u{12c8}-\u{12ce}\u{12d0}-\u{12d6}\u{12d8}-\u{12ee}\u{12f0}-\u{130e}\u{1312}-\u{1315}\u{1318}-\u{131e}\u{1320}-\u{1346}\u{1348}-\u{135a}\u{1361}-\u{137c}\u{13a0}-\u{13f4}\u{1401}-\u{1676}\u{1681}-\u{169a}\u{16a0}-\u{16f0}\u{1700}-\u{170c}\u{170e}-\u{1711}\u{1720}-\u{1731}\u{1735}-\u{1736}\u{1740}-\u{1751}\u{1760}-\u{176c}\u{176e}-\u{1770}\u{1780}-\u{17b6}\u{17be}-\u{17c5}\u{17c7}-\u{17c8}\u{17d4}-\u{17da}\u{17e0}-\u{17e9}\u{1810}-\u{1819}\u{1820}-\u{1877}\u{1880}-\u{18a8}\u{1e00}-\u{1e9b}\u{1ea0}-\u{1ef9}\u{1f00}-\u{1f15}\u{1f18}-\u{1f1d}\u{1f20}-\u{1f45}\u{1f48}-\u{1f4d}\u{1f50}-\u{1f57}\u{1f5f}-\u{1f7d}\u{1f80}-\u{1fb4}\u{1fb6}-\u{1fbc}\u{1fc2}-\u{1fc4}\u{1fc6}-\u{1fcc}\u{1fd0}-\u{1fd3}\u{1fd6}-\u{1fdb}\u{1fe0}-\u{1fec}\u{1ff2}-\u{1ff4}\u{1ff6}-\u{1ffc}\u{210a}-\u{2113
}\u{2119}-\u{211d}\u{212a}-\u{212d}\u{212f}-\u{2131}\u{2133}-\u{2139}\u{213d}-\u{213f}\u{2145}-\u{2149}\u{2160}-\u{2183}\u{2336}-\u{237a}\u{249c}-\u{24e9}\u{3005}-\u{3007}\u{3021}-\u{3029}\u{3031}-\u{3035}\u{3038}-\u{303c}\u{3041}-\u{3096}\u{309d}-\u{309f}\u{30a1}-\u{30fa}\u{30fc}-\u{30ff}\u{3105}-\u{312c}\u{3131}-\u{318e}\u{3190}-\u{31b7}\u{31f0}-\u{321c}\u{3220}-\u{3243}\u{3260}-\u{327b}\u{327f}-\u{32b0}\u{32c0}-\u{32cb}\u{32d0}-\u{32fe}\u{3300}-\u{3376}\u{337b}-\u{33dd}\u{33e0}-\u{33fe}\u{3400}-\u{4db5}\u{4e00}-\u{9fa5}\u{a000}-\u{a48c}\u{ac00}-\u{d7a3}\u{e000}-\u{fa2d}\u{fa30}-\u{fa6a}\u{fb00}-\u{fb06}\u{fb13}-\u{fb17}\u{ff21}-\u{ff3a}\u{ff41}-\u{ff5a}\u{ff66}-\u{ffbe}\u{ffc2}-\u{ffc7}\u{ffca}-\u{ffcf}\u{ffd2}-\u{ffd7}\u{ffda}-\u{ffdc}\u{10300}-\u{1031e}\u{10320}-\u{10323}\u{10330}-\u{1034a}\u{10400}-\u{10425}\u{10428}-\u{1044d}\u{1d000}-\u{1d0f5}\u{1d100}-\u{1d126}\u{1d12a}-\u{1d166}\u{1d16a}-\u{1d172}\u{1d183}-\u{1d184}\u{1d18c}-\u{1d1a9}\u{1d1ae}-\u{1d1dd}\u{1d400}-\u{1d454}\u{1d456}-\u{1d49c}\u{1d49e}-\u{1d49f}\u{1d4a5}-\u{1d4a6}\u{1d4a9}-\u{1d4ac}\u{1d4ae}-\u{1d4b9}\u{1d4bd}-\u{1d4c0}\u{1d4c2}-\u{1d4c3}\u{1d4c5}-\u{1d505}\u{1d507}-\u{1d50a}\u{1d50d}-\u{1d514}\u{1d516}-\u{1d51c}\u{1d51e}-\u{1d539}\u{1d53b}-\u{1d53e}\u{1d540}-\u{1d544}\u{1d54a}-\u{1d550}\u{1d552}-\u{1d6a3}\u{1d6a8}-\u{1d7c9}\u{20000}-\u{2a6d6}\u{2f800}-\u{2fa1d}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}\p{Cs}]) + | # RandALCat preceded by LCat + \g .*? \g + /mx.freeze + + BIDI_DESC_REQ3 = "A string with RandALCat characters must start and end with RandALCat characters." + + # Bidirectional Characters [StringPrep, §6], Requirement 3:: + # If a string contains any RandALCat character, a RandALCat + # character MUST be the first character of the string, and a + # RandALCat character MUST be the last character of the string. 
+ BIDI_FAILS_REQ3 = / # contains RandALCat but doesn't start with RandALCat + \A(?(?-mix:[^\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}])) + .*? + (?(?-mix:[\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}])) + | # contains RandALCat but doesn't end with RandALCat + \g .*? \g\z + /mx.freeze + + # Bidirectional Characters [StringPrep, §6] + BIDI_FAILURE = /(?mx-i: # RandALCat followed by LCat + (?[\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}]) + .*? 
+ (?[\u{00aa 00b5 00ba 02ee 037a 0386 038c 0589 0903 0950 09b2 09d7 0a5e 0a83 0a8d 0ac9 0ad0 0ae0 0b40 0b57 0b83 0b9c 0bd7 0cbe 0cde 0d57 0dbd 0e84 0e8a 0e8d 0ea5 0ea7 0ebd 0ec6 0f36 0f38 0f7f 0f85 0fcf 102c 1031 1038 10fb 1248 1258 1288 12b0 12c0 1310 17dc 1f59 1f5b 1f5d 1fbe 200e 2071 207f 2102 2107 2115 2124 2126 2128 2395 1d4a2 1d4bb 1d546}\u{0041}-\u{005a}\u{0061}-\u{007a}\u{00c0}-\u{00d6}\u{00d8}-\u{00f6}\u{00f8}-\u{0220}\u{0222}-\u{0233}\u{0250}-\u{02ad}\u{02b0}-\u{02b8}\u{02bb}-\u{02c1}\u{02d0}-\u{02d1}\u{02e0}-\u{02e4}\u{0388}-\u{038a}\u{038e}-\u{03a1}\u{03a3}-\u{03ce}\u{03d0}-\u{03f5}\u{0400}-\u{0482}\u{048a}-\u{04ce}\u{04d0}-\u{04f5}\u{04f8}-\u{04f9}\u{0500}-\u{050f}\u{0531}-\u{0556}\u{0559}-\u{055f}\u{0561}-\u{0587}\u{0905}-\u{0939}\u{093d}-\u{0940}\u{0949}-\u{094c}\u{0958}-\u{0961}\u{0964}-\u{0970}\u{0982}-\u{0983}\u{0985}-\u{098c}\u{098f}-\u{0990}\u{0993}-\u{09a8}\u{09aa}-\u{09b0}\u{09b6}-\u{09b9}\u{09be}-\u{09c0}\u{09c7}-\u{09c8}\u{09cb}-\u{09cc}\u{09dc}-\u{09dd}\u{09df}-\u{09e1}\u{09e6}-\u{09f1}\u{09f4}-\u{09fa}\u{0a05}-\u{0a0a}\u{0a0f}-\u{0a10}\u{0a13}-\u{0a28}\u{0a2a}-\u{0a30}\u{0a32}-\u{0a33}\u{0a35}-\u{0a36}\u{0a38}-\u{0a39}\u{0a3e}-\u{0a40}\u{0a59}-\u{0a5c}\u{0a66}-\u{0a6f}\u{0a72}-\u{0a74}\u{0a85}-\u{0a8b}\u{0a8f}-\u{0a91}\u{0a93}-\u{0aa8}\u{0aaa}-\u{0ab0}\u{0ab2}-\u{0ab3}\u{0ab5}-\u{0ab9}\u{0abd}-\u{0ac0}\u{0acb}-\u{0acc}\u{0ae6}-\u{0aef}\u{0b02}-\u{0b03}\u{0b05}-\u{0b0c}\u{0b0f}-\u{0b10}\u{0b13}-\u{0b28}\u{0b2a}-\u{0b30}\u{0b32}-\u{0b33}\u{0b36}-\u{0b39}\u{0b3d}-\u{0b3e}\u{0b47}-\u{0b48}\u{0b4b}-\u{0b4c}\u{0b5c}-\u{0b5d}\u{0b5f}-\u{0b61}\u{0b66}-\u{0b70}\u{0b85}-\u{0b8a}\u{0b8e}-\u{0b90}\u{0b92}-\u{0b95}\u{0b99}-\u{0b9a}\u{0b9e}-\u{0b9f}\u{0ba3}-\u{0ba4}\u{0ba8}-\u{0baa}\u{0bae}-\u{0bb5}\u{0bb7}-\u{0bb9}\u{0bbe}-\u{0bbf}\u{0bc1}-\u{0bc2}\u{0bc6}-\u{0bc8}\u{0bca}-\u{0bcc}\u{0be7}-\u{0bf2}\u{0c01}-\u{0c03}\u{0c05}-\u{0c0c}\u{0c0e}-\u{0c10}\u{0c12}-\u{0c28}\u{0c2a}-\u{0c33}\u{0c35}-\u{0c39}\u{0c41}-\u{0c44}\u{0c60}-\u{0c61}\u{0c66}-\u{0c6f}\u{0c
82}-\u{0c83}\u{0c85}-\u{0c8c}\u{0c8e}-\u{0c90}\u{0c92}-\u{0ca8}\u{0caa}-\u{0cb3}\u{0cb5}-\u{0cb9}\u{0cc0}-\u{0cc4}\u{0cc7}-\u{0cc8}\u{0cca}-\u{0ccb}\u{0cd5}-\u{0cd6}\u{0ce0}-\u{0ce1}\u{0ce6}-\u{0cef}\u{0d02}-\u{0d03}\u{0d05}-\u{0d0c}\u{0d0e}-\u{0d10}\u{0d12}-\u{0d28}\u{0d2a}-\u{0d39}\u{0d3e}-\u{0d40}\u{0d46}-\u{0d48}\u{0d4a}-\u{0d4c}\u{0d60}-\u{0d61}\u{0d66}-\u{0d6f}\u{0d82}-\u{0d83}\u{0d85}-\u{0d96}\u{0d9a}-\u{0db1}\u{0db3}-\u{0dbb}\u{0dc0}-\u{0dc6}\u{0dcf}-\u{0dd1}\u{0dd8}-\u{0ddf}\u{0df2}-\u{0df4}\u{0e01}-\u{0e30}\u{0e32}-\u{0e33}\u{0e40}-\u{0e46}\u{0e4f}-\u{0e5b}\u{0e81}-\u{0e82}\u{0e87}-\u{0e88}\u{0e94}-\u{0e97}\u{0e99}-\u{0e9f}\u{0ea1}-\u{0ea3}\u{0eaa}-\u{0eab}\u{0ead}-\u{0eb0}\u{0eb2}-\u{0eb3}\u{0ec0}-\u{0ec4}\u{0ed0}-\u{0ed9}\u{0edc}-\u{0edd}\u{0f00}-\u{0f17}\u{0f1a}-\u{0f34}\u{0f3e}-\u{0f47}\u{0f49}-\u{0f6a}\u{0f88}-\u{0f8b}\u{0fbe}-\u{0fc5}\u{0fc7}-\u{0fcc}\u{1000}-\u{1021}\u{1023}-\u{1027}\u{1029}-\u{102a}\u{1040}-\u{1057}\u{10a0}-\u{10c5}\u{10d0}-\u{10f8}\u{1100}-\u{1159}\u{115f}-\u{11a2}\u{11a8}-\u{11f9}\u{1200}-\u{1206}\u{1208}-\u{1246}\u{124a}-\u{124d}\u{1250}-\u{1256}\u{125a}-\u{125d}\u{1260}-\u{1286}\u{128a}-\u{128d}\u{1290}-\u{12ae}\u{12b2}-\u{12b5}\u{12b8}-\u{12be}\u{12c2}-\u{12c5}\u{12c8}-\u{12ce}\u{12d0}-\u{12d6}\u{12d8}-\u{12ee}\u{12f0}-\u{130e}\u{1312}-\u{1315}\u{1318}-\u{131e}\u{1320}-\u{1346}\u{1348}-\u{135a}\u{1361}-\u{137c}\u{13a0}-\u{13f4}\u{1401}-\u{1676}\u{1681}-\u{169a}\u{16a0}-\u{16f0}\u{1700}-\u{170c}\u{170e}-\u{1711}\u{1720}-\u{1731}\u{1735}-\u{1736}\u{1740}-\u{1751}\u{1760}-\u{176c}\u{176e}-\u{1770}\u{1780}-\u{17b6}\u{17be}-\u{17c5}\u{17c7}-\u{17c8}\u{17d4}-\u{17da}\u{17e0}-\u{17e9}\u{1810}-\u{1819}\u{1820}-\u{1877}\u{1880}-\u{18a8}\u{1e00}-\u{1e9b}\u{1ea0}-\u{1ef9}\u{1f00}-\u{1f15}\u{1f18}-\u{1f1d}\u{1f20}-\u{1f45}\u{1f48}-\u{1f4d}\u{1f50}-\u{1f57}\u{1f5f}-\u{1f7d}\u{1f80}-\u{1fb4}\u{1fb6}-\u{1fbc}\u{1fc2}-\u{1fc4}\u{1fc6}-\u{1fcc}\u{1fd0}-\u{1fd3}\u{1fd6}-\u{1fdb}\u{1fe0}-\u{1fec}\u{1ff2}-\u{1ff4}\u{1ff6}-\u{1ffc}\u{210a}-\u{2113
}\u{2119}-\u{211d}\u{212a}-\u{212d}\u{212f}-\u{2131}\u{2133}-\u{2139}\u{213d}-\u{213f}\u{2145}-\u{2149}\u{2160}-\u{2183}\u{2336}-\u{237a}\u{249c}-\u{24e9}\u{3005}-\u{3007}\u{3021}-\u{3029}\u{3031}-\u{3035}\u{3038}-\u{303c}\u{3041}-\u{3096}\u{309d}-\u{309f}\u{30a1}-\u{30fa}\u{30fc}-\u{30ff}\u{3105}-\u{312c}\u{3131}-\u{318e}\u{3190}-\u{31b7}\u{31f0}-\u{321c}\u{3220}-\u{3243}\u{3260}-\u{327b}\u{327f}-\u{32b0}\u{32c0}-\u{32cb}\u{32d0}-\u{32fe}\u{3300}-\u{3376}\u{337b}-\u{33dd}\u{33e0}-\u{33fe}\u{3400}-\u{4db5}\u{4e00}-\u{9fa5}\u{a000}-\u{a48c}\u{ac00}-\u{d7a3}\u{e000}-\u{fa2d}\u{fa30}-\u{fa6a}\u{fb00}-\u{fb06}\u{fb13}-\u{fb17}\u{ff21}-\u{ff3a}\u{ff41}-\u{ff5a}\u{ff66}-\u{ffbe}\u{ffc2}-\u{ffc7}\u{ffca}-\u{ffcf}\u{ffd2}-\u{ffd7}\u{ffda}-\u{ffdc}\u{10300}-\u{1031e}\u{10320}-\u{10323}\u{10330}-\u{1034a}\u{10400}-\u{10425}\u{10428}-\u{1044d}\u{1d000}-\u{1d0f5}\u{1d100}-\u{1d126}\u{1d12a}-\u{1d166}\u{1d16a}-\u{1d172}\u{1d183}-\u{1d184}\u{1d18c}-\u{1d1a9}\u{1d1ae}-\u{1d1dd}\u{1d400}-\u{1d454}\u{1d456}-\u{1d49c}\u{1d49e}-\u{1d49f}\u{1d4a5}-\u{1d4a6}\u{1d4a9}-\u{1d4ac}\u{1d4ae}-\u{1d4b9}\u{1d4bd}-\u{1d4c0}\u{1d4c2}-\u{1d4c3}\u{1d4c5}-\u{1d505}\u{1d507}-\u{1d50a}\u{1d50d}-\u{1d514}\u{1d516}-\u{1d51c}\u{1d51e}-\u{1d539}\u{1d53b}-\u{1d53e}\u{1d540}-\u{1d544}\u{1d54a}-\u{1d550}\u{1d552}-\u{1d6a3}\u{1d6a8}-\u{1d7c9}\u{20000}-\u{2a6d6}\u{2f800}-\u{2fa1d}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}\p{Cs}]) + | # RandALCat preceded by LCat + \g .*? \g + ) | (?mx-i: # contains RandALCat but doesn't start with RandALCat + \A(?(?-mix:[^\u{05be 05c0 05c3 061b 061f 06dd 0710 07b1 200f fb1d fb3e}\u{05d0}-\u{05ea}\u{05f0}-\u{05f4}\u{0621}-\u{063a}\u{0640}-\u{064a}\u{066d}-\u{066f}\u{0671}-\u{06d5}\u{06e5}-\u{06e6}\u{06fa}-\u{06fe}\u{0700}-\u{070d}\u{0712}-\u{072c}\u{0780}-\u{07a5}\u{fb1f}-\u{fb28}\u{fb2a}-\u{fb36}\u{fb38}-\u{fb3c}\u{fb40}-\u{fb41}\u{fb43}-\u{fb44}\u{fb46}-\u{fbb1}\u{fbd3}-\u{fd3d}\u{fd50}-\u{fd8f}\u{fd92}-\u{fdc7}\u{fdf0}-\u{fdfc}\u{fe70}-\u{fe74}\u{fe76}-\u{fefc}])) + .*? 
+ g + | # contains RandALCat but doesn't end with RandALCat + \g .*? \g\z + )/mx.freeze + + # Names of each codepoint table in the RFC-3454 appendices + TABLE_TITLES = { + "A.1" => "Unassigned code points in Unicode 3.2", + "B.1" => "Commonly mapped to nothing", + "B.2" => "Mapping for case-folding used with NFKC", + "B.3" => "Mapping for case-folding used with no normalization", + "C.1" => "Space characters", + "C.1.1" => "ASCII space characters", + "C.1.2" => "Non-ASCII space characters", + "C.2" => "Control characters", + "C.2.1" => "ASCII control characters", + "C.2.2" => "Non-ASCII control characters", + "C.3" => "Private use", + "C.4" => "Non-character code points", + "C.5" => "Surrogate codes", + "C.6" => "Inappropriate for plain text", + "C.7" => "Inappropriate for canonical representation", + "C.8" => "Change display properties or are deprecated", + "C.9" => "Tagging characters", + "D.1" => "Characters with bidirectional property \"R\" or \"AL\"", + "D.2" => "Characters with bidirectional property \"L\"", + }.freeze + + # Regexps matching each codepoint table in the RFC-3454 appendices + TABLE_REGEXPS = { + "A.1" => IN_A_1, + "B.1" => IN_B_1, + "B.2" => IN_B_2, + "B.3" => IN_B_3, + "C.1.1" => IN_C_1_1, + "C.1.2" => IN_C_1_2, + "C.2.1" => IN_C_2_1, + "C.2.2" => IN_C_2_2, + "C.3" => IN_C_3, + "C.4" => IN_C_4, + "C.5" => IN_C_5, + "C.6" => IN_C_6, + "C.7" => IN_C_7, + "C.8" => IN_C_8, + "C.9" => IN_C_9, + "D.1" => IN_D_1, + "D.2" => IN_D_2, + }.freeze + + end +end diff --git a/rakelib/rfcs.rake b/rakelib/rfcs.rake new file mode 100644 index 00000000..772f2eb6 --- /dev/null +++ b/rakelib/rfcs.rake @@ -0,0 +1,98 @@ +# frozen_string_literal: true + +RFCS = { + + # Core IMAP RFCs + 2060 => "IMAP4rev1 (obsolete)", + 3501 => "IMAP4rev1", # supported by nearly all email servers + 4466 => "Collected Extensions to IMAP4 ABNF", + 9051 => "IMAP4rev2", + + # RFC-9051 Normative References (not a complete list) + 2152 => "UTF-7", + 2180 => "IMAP4 Multi-Accessed Mailbox 
Practice", + 2683 => "IMAP4 Implementation Recommendations", + 5258 => "IMAP4 LIST-EXTENDED Extensions", + 5788 => "IMAP4 keyword registry", + 8314 => "Cleartext Considered Obsolete: Use of TLS for Email", + + # SASL + 4422 => "SASL, AUTH=EXTERNAL", + 4959 => "IMAP SASL-IR", + # stringprep + 3454 => "stringprep", + 4013 => "SASLprep", + 8265 => "PRECIS", # obsoletes RFC 7613, which obsoleted SASLprep + # SASL mechanisms (not a complete list) + 2195 => "AUTH=CRAM-MD5", + 4505 => "AUTH=ANONYMOUS", + 4616 => "AUTH=PLAIN", + 4752 => "AUTH=GSSAPI (Kerberos V5)", + 5802 => "AUTH=SCRAM-SHA-1", + 6331 => "AUTH=DIGEST-MD5", + 6595 => "AUTH=SAML20", + 6616 => "AUTH=OPENID20", + 7628 => "AUTH=OAUTH10A AUTH=OAUTHBEARER", + 7677 => "AUTH=SCRAM-SHA-256", + + # "Informational" RFCs + 1733 => "Distributed E-Mail Models in IMAP4", + 4549 => "Synchronization Operations for Disconnected IMAP4 Clients", + 6151 => "Updated Security Considerations for MD5 Message-Digest and HMAC-MD5", + + # "Best Current Practice" RFCs + 7525 => "Recommendations for Secure Use of TLS and DTLS", + + # related email specifications + 6376 => "DomainKeys Identified Mail (DKIM) Signatures", + 6409 => "Message Submission for Mail", + + # Other IMAP4 "Standards Track" RFCs + 5092 => "IMAP URL Scheme", + 5530 => "IMAP Response Codes", + 6186 => "Use of SRV Records for Locating Email Submission/Access Services", + 8305 => "Happy Eyeballs Version 2: Better Connectivity Using Concurrency", + + # IMAP4 Extensions + 2087 => "IMAP QUOTA", + 2177 => "IMAP IDLE", + 2193 => "IMAP MAILBOX-REFERRALS", + 2342 => "IMAP NAMESPACE", + 3348 => "IMAP CHILDREN", + 3516 => "IMAP BINARY", + 3691 => "IMAP UNSELECT", + 4314 => "IMAP ACL, RIGHTS=", + 4315 => "IMAP UIDPLUS", + 4731 => "IMAP ESEARCH (for controlling what is returned)", + 5161 => "IMAP ENABLE Extension", + 5182 => "IMAP SEARCHRES (for referencing the last result)", + 5255 => "IMAP I18N: LANGUAGE, I18NLEVEL={1,2}", + 5256 => "IMAP SORT, THREAD", + 5465 => "IMAP NOTIFY", + 5819 => "IMAP 
LIST-STATUS", + 6154 => "IMAP SPECIAL-USE, CREATE-SPECIAL-USE", + 6851 => "IMAP MOVE", + 6855 => "IMAP UTF8=", + 7162 => "IMAP CONDSTORE and QRESYNC (quick resynchronization)", + 7888 => "IMAP LITERAL+, LITERAL- (Non-synchronizing Literals)", + + 8437 => "IMAP UNAUTHENTICATE", + 8438 => "IMAP STATUS=SIZE", + 8474 => "IMAP OBJECTID", + +}.freeze + +task :rfcs => RFCS.keys.map {|n| "rfcs/rfc%04d.txt" % [n] } + +RFC_RE = %r{rfcs/rfc(\d+).*\.txt}.freeze +rule RFC_RE do |t| + require "fileutils" + FileUtils.mkpath "rfcs" + require "net/http" + t.name =~ RFC_RE + rfc_url = URI("https://www.rfc-editor.org/rfc/rfc#$1.txt") + rfc_txt = Net::HTTP.get(rfc_url) + File.write(t.name, rfc_txt) +end + +CLEAN.include "rfcs/rfc*.txt" diff --git a/rakelib/saslprep.rake b/rakelib/saslprep.rake new file mode 100644 index 00000000..5566aa46 --- /dev/null +++ b/rakelib/saslprep.rake @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +require_relative "string_prep_tables_generator" + +generator = StringPrepTablesGenerator.new + +file generator.json_filename => generator.json_deps do |t| + generator.generate_json_data_file +end + +directory "lib/net/imap/sasl" + +file "lib/net/imap/sasl/stringprep_tables.rb" => generator.rb_deps do |t| + File.write t.name, generator.stringprep_rb +end + +file "lib/net/imap/sasl/saslprep_tables.rb" => generator.rb_deps do |t| + File.write t.name, generator.saslprep_rb +end + +GENERATED_RUBY = FileList.new( + "lib/net/imap/sasl/stringprep_tables.rb", + "lib/net/imap/sasl/saslprep_tables.rb", +) + +CLEAN.include generator.clean_deps +CLOBBER.include GENERATED_RUBY + +task saslprep_rb: GENERATED_RUBY +task test: :saslprep_rb diff --git a/rakelib/string_prep_tables_generator.rb b/rakelib/string_prep_tables_generator.rb new file mode 100644 index 00000000..76e91b7c --- /dev/null +++ b/rakelib/string_prep_tables_generator.rb @@ -0,0 +1,423 @@ +# frozen_string_literal: true + +# Generator for stringprep regexps. 
# Generates the StringPrep (RFC 3454) and SASLprep (RFC 4013) table sources.
#
# Combines Unicode character classes with generated tables. Generated regexps
# are still used to test that the written regexps conform to the specification.
# Some tables don't match up well with any character properties available to
# ruby's regexp engine. Those use the table-generated regexps.
class StringPrepTablesGenerator
  STRINGPREP_RFC_FILE  = "rfcs/rfc3454.txt"
  STRINGPREP_JSON_FILE = "rfcs/rfc3454-stringprep_tables.json"

  # valid UTF-8 can't contain these codepoints
  # checking for them anyway, using /\p{Cs}/ ;)
  SURROGATES_RANGE = 0xD800..0xDFFF

  # Tables whose codepoints are prohibited output for SASLprep (RFC4013 §2.3).
  SASL_TABLES_PROHIBITED = %w[
    C.1.2 C.2.1 C.2.2 C.3 C.4 C.5 C.6 C.7 C.8 C.9
  ].freeze

  # SASL_TABLES_PROHIBITED plus the unassigned codepoints table (A.1).
  SASL_TABLES_PROHIBITED_STORED = %w[
    A.1 C.1.2 C.2.1 C.2.2 C.3 C.4 C.5 C.6 C.7 C.8 C.9
  ].freeze

  attr_reader :json_filename, :rfc_filename

  # rfc_filename::  path of the RFC 3454 text that is parsed into tables
  # json_filename:: path of the JSON cache written by #generate_json_data_file
  #                 and read back by #tables and #titles
  def initialize(rfc_filename: STRINGPREP_RFC_FILE,
                 json_filename: STRINGPREP_JSON_FILE)
    @rfc_filename  = rfc_filename
    @json_filename = json_filename
  end

  # for rake deps
  #
  # These follow the filenames given to #initialize, so a generator that was
  # configured with non-default paths reports matching dependencies.
  def json_deps;  Rake::FileList.new __FILE__, rfc_filename  end
  def rb_deps;    Rake::FileList.new __FILE__, json_filename end
  def clean_deps; Rake::FileList.new json_filename           end

  # Parses #rfc_filename and writes the tables (and their titles) to
  # #json_filename as pretty-printed JSON.
  def generate_json_data_file
    require "json" # loaded lazily: only the rake tasks need it
    data = parse_rfc_text(File.read(rfc_filename))
    File.write json_filename, JSON.pretty_generate(data)
  end

  # Memoized views of the JSON data, each keyed by table name ("A.1", ...):
  # tables::  raw table entries (an Array, or a Hash for mapping tables)
  # titles::  table titles
  # ranges::  Arrays of Integer codepoint Ranges
  # arrays::  flat Arrays of Integer codepoints
  # sets::    Sets of Integer codepoints
  # regexps:: character-class Regexps generated directly from the tables
  def tables;  @tables  ||= load_tables_and_titles_from_json!.first end
  def titles;  @titles  ||= load_tables_and_titles_from_json!.last end
  def ranges;  @ranges  ||= tables.transform_values(&method(:to_ranges)) end
  def arrays;  @arrays  ||= ranges.transform_values {|t| t.flat_map(&:to_a) } end
  def sets;    @sets    ||= arrays.transform_values(&:to_set) end
  def regexps; @regexps ||= arrays.transform_values(&method(:to_regexp)) end

  # Regexps assigned to constants, keyed by <tt>[*table_names, negate]</tt>.
  # Mixes the handwritten regexps with the table-generated ones.
  def asgn_regexps; @asgn_regexps || asgn_regexps! end

  # Returns one generated Regexp matching the union of the named tables (or,
  # with +negate+, everything outside that union).
  def merged_tables_regex(*table_names, negate: false)
    codepoints = table_names.flat_map {|name| arrays.fetch(name) }
    to_regexp(codepoints, negate: negate)
  end

  # Returns the Regexp assigned for the named table(s), generating and caching
  # a merged table regexp when none has been assigned yet.
  def regexp_for(*names, negate: false)
    asgn_regexps[[*names, negate]] ||= merged_tables_regex(*names, negate: negate)
  end

  # Returns the ruby source for the generated StringPrep tables file.
  def stringprep_rb
    <<~RUBY
      # frozen_string_literal: true

      #--
      # This file is generated from RFC3454, by rake. Don't edit directly.
      #++

      module Net::IMAP::SASL

        module StringPrep

          #{asgn_table "A.1"}

          #{asgn_table "B.1"}

          #{asgn_table "B.2"}

          #{asgn_table "B.3"}

          #{asgn_table "C.1.1"}

          #{asgn_table "C.1.2"}

          #{asgn_table "C.2.1"}

          #{asgn_table "C.2.2"}

          #{asgn_table "C.3"}

          #{asgn_table "C.4"}

          #{asgn_table "C.5"}

          #{asgn_table "C.6"}

          #{asgn_table "C.7"}

          #{asgn_table "C.8"}

          #{asgn_table "C.9"}

          #{asgn_table "D.1"}

          # Used to check req3 of bidirectional checks
          #{asgn_table "D.1", negate: true}

          #{asgn_table "D.2"}

          BIDI_DESC_REQ2 = "A string with RandALCat characters must not contain LCat characters."

          # Bidirectional Characters [StringPrep, §6], Requirement 2::
          # If a string contains any RandALCat character, the string MUST NOT
          # contain any LCat character.
          BIDI_FAILS_REQ2 = #{bidi_fails_req2.inspect}.freeze

          BIDI_DESC_REQ3 = "A string with RandALCat characters must start and end with RandALCat characters."

          # Bidirectional Characters [StringPrep, §6], Requirement 3::
          # If a string contains any RandALCat character, a RandALCat
          # character MUST be the first character of the string, and a
          # RandALCat character MUST be the last character of the string.
          BIDI_FAILS_REQ3 = #{bidi_fails_req3.inspect}.freeze

          # Bidirectional Characters [StringPrep, §6]
          BIDI_FAILURE = #{bidi_failure_regexp.inspect}.freeze

          # Names of each codepoint table in the RFC-3454 appendices
          TABLE_TITLES = {
            #{table_titles_rb}
          }.freeze

          # Regexps matching each codepoint table in the RFC-3454 appendices
          TABLE_REGEXPS = {
            #{table_regexps_rb}
          }.freeze

        end
      end
    RUBY
  end

  # Returns the <tt>"name" => "title",</tt> hash entries for TABLE_TITLES.
  #
  # indent:: nesting depth (two spaces per level) applied to every entry after
  #          the first, so they line up with the first entry in the heredoc.
  def table_titles_rb(indent = 3)
    titles
      .map {|t| "%p => %p," % t }
      .join("\n#{"  " * indent}")
  end

  # Returns the <tt>"name" => IN_X_Y,</tt> hash entries for TABLE_REGEXPS.
  #
  # Only single, non-negated tables are listed: merged-table keys (which hold
  # several names) and negated keys have no entry of their own.
  #
  # indent:: nesting depth (two spaces per level), as for #table_titles_rb.
  def table_regexps_rb(indent = 3)
    asgn_regexps # => { ["A.1", false] => regexp, ... }
      .keys
      .select {|key| key.size == 2 && !key.last }
      .map {|name, _| "%p => %s," % [name, regexp_const_name(name)] }
      .join("\n#{"  " * indent}")
  end

  # Returns the ruby source for the generated SASLprep tables file.
  def saslprep_rb
    <<~RUBY
      # frozen_string_literal: true

      #--
      # This file is generated from RFC3454, by rake. Don't edit directly.
      #++

      module Net::IMAP::SASL

        module SASLprep

          # RFC4013 §2.1 Mapping - mapped to space
          # * non-ASCII space characters (\\StringPrep\\[\\"C.1.2\\"]) that can be
          #   mapped to SPACE (U+0020), and
          #
          # Equal to \\StringPrep\\[\\"C.1.2\\"].
          # Redefined here to avoid loading the StringPrep module.
          MAP_TO_SPACE = #{regex_str "C.1.2"}

          # RFC4013 §2.1 Mapping - mapped to nothing
          # the "commonly mapped to nothing" characters (\\StringPrep\\[\\"B.1\\"])
          # that can be mapped to nothing.
          #
          # Equal to \\StringPrep\\[\\"B.1\\"].
          # Redefined here to avoid loading the StringPrep module.
          MAP_TO_NOTHING = #{regex_str "B.1"}

          # RFC4013 §2.3 Prohibited Output::
          # * Non-ASCII space characters — \\StringPrep\\[\\"C.1.2\\"]
          # * ASCII control characters — \\StringPrep\\[\\"C.2.1\\"]
          # * Non-ASCII control characters — \\StringPrep\\[\\"C.2.2\\"]
          # * Private Use characters — \\StringPrep\\[\\"C.3\\"]
          # * Non-character code points — \\StringPrep\\[\\"C.4\\"]
          # * Surrogate code points — \\StringPrep\\[\\"C.5\\"]
          # * Inappropriate for plain text characters — \\StringPrep\\[\\"C.6\\"]
          # * Inappropriate for canonical representation characters — \\StringPrep\\[\\"C.7\\"]
          # * Change display properties or deprecated characters — \\StringPrep\\[\\"C.8\\"]
          # * Tagging characters — \\StringPrep\\[\\"C.9\\"]
          TABLES_PROHIBITED = #{SASL_TABLES_PROHIBITED.inspect}.freeze

          # Adds unassigned (by Unicode 3.2) codepoints to TABLES_PROHIBITED.
          #
          # RFC4013 §2.5 Unassigned Code Points::
          # This profile specifies the \\StringPrep\\[\\"A.1\\"] table as its list of
          # unassigned code points.
          TABLES_PROHIBITED_STORED = ["A.1", *TABLES_PROHIBITED].freeze

          # Matches codepoints prohibited by RFC4013 §2.3.
          #
          # See TABLES_PROHIBITED.
          #
          # Equal to +Regexp.union+ of the TABLES_PROHIBITED tables. Redefined
          # here to avoid loading the StringPrep module unless necessary.
          PROHIBITED_OUTPUT = #{regex_str(*SASL_TABLES_PROHIBITED)}

          # RFC4013 §2.5 Unassigned Code Points::
          # This profile specifies the \\StringPrep\\[\\"A.1\\"] table as its list of
          # unassigned code points.
          UNASSIGNED = #{regex_str "A.1"}

          # Matches codepoints prohibited by RFC4013 §2.3 and §2.5.
          #
          # See TABLES_PROHIBITED_STORED.
          PROHIBITED_OUTPUT_STORED = Regexp.union(
            UNASSIGNED, PROHIBITED_OUTPUT
          ).freeze

          # Bidirectional Characters [StringPrep, §6]
          BIDI_FAILURE = #{bidi_failure_regexp.inspect}.freeze

          # Matches strings prohibited by RFC4013 §2.3 and §2.4.
          #
          # This checks prohibited output and bidirectional characters.
          PROHIBITED = Regexp.union(
            PROHIBITED_OUTPUT, BIDI_FAILURE,
          ).freeze

          # Matches strings prohibited by RFC4013 §2.3, §2.4, and §2.5.
          #
          # This checks prohibited output, bidirectional characters, and
          # unassigned codepoints.
          PROHIBITED_STORED = Regexp.union(
            PROHIBITED_OUTPUT_STORED, BIDI_FAILURE,
          ).freeze

        end
      end
    RUBY
  end

  private

  # Extracts the appendix tables and their titles from the RFC 3454 text.
  #
  # Returns a Hash of table name => entries, plus a "titles" key holding the
  # table name => title Hash. Regular tables are Arrays of "XXXX" or
  # "XXXX-YYYY" hex strings. Mapping tables are Hashes from a hex codepoint to
  # the Array of hex codepoints it maps to.
  #
  # Raises RuntimeError for an unrecognized line inside of a table.
  def parse_rfc_text(rfc3454_text)
    titles  = {}
    tables  = {}
    entries = nil # the table currently being read; nil between tables
    rfc3454_text.each_line do |line|
      case line
      when /^([A-D]\.[1-9](?:\.[1-9])?) (.*)/
        titles[Regexp.last_match(1)] = Regexp.last_match(2)
      when /^ {3}-{5} Start Table (\S*)/
        entries = tables[Regexp.last_match(1)] = []
      when /^ {3}-{5} End Table /
        entries = nil
      when /^ {3}([0-9A-F]+); ([ 0-9A-F]*)(?:;[^;]*)$/ # mapping tables
        entries&.push [Regexp.last_match(1), Regexp.last_match(2).split(/ +/)]
      when /^ {3}([-0-9A-F]+)(?:;[^;]*)?$/ # regular tables
        entries&.push Regexp.last_match(1)
      when /^ {3}(.*)/
        raise "expected to match %p" % Regexp.last_match(1) if entries
      end
    end
    # Mapping tables hold [codepoint, mapped] pairs; checking the entry's class
    # (rather than its size) also copes with an empty table.
    tables.transform_values! {|t| t.first.is_a?(Array) ? t.to_h : t }
    tables["titles"] = titles
    tables
  end

  # Reads #json_filename and memoizes both @tables and @titles.
  # Returns <tt>[tables, titles]</tt>.
  def load_tables_and_titles_from_json!
    require "json" # loaded lazily: only needed once the tables are used
    @tables = JSON.parse(File.read(json_filename))
    @titles = @tables.delete "titles"
    [@tables, @titles]
  end

  # Converts table entries ("XXXX" or "XXXX-YYYY" hex strings, or a mapping
  # table Hash keyed by them) into an Array of Integer Ranges.
  def to_ranges(table)
    (table.is_a?(Hash) ? table.keys : table)
      .map {|range| range.split("-").map {|cp| Integer cp, 16 } }
      .map {|first, last| first..(last || first) }
  end

  # Builds a character class Regexp from an Array of Integer codepoints.
  #
  # Starting from a codepoints array (rather than ranges) to deduplicate merged
  # tables.
  #
  # Raises RegexpError when there is nothing to put in the character class.
  def to_regexp(codepoints, negate: false)
    ranges, singles = codepoints
      .grep_v(SURROGATES_RANGE) # remove surrogate codepoints from C.5 and D.2
      .uniq
      .sort
      .chunk_while {|cp1, cp2| cp1 + 1 == cp2 }      # find contiguous chunks
      .map {|chunk| chunk.map {|cp| "%04x" % cp } }  # convert to hex strings
      .partition {|chunk| chunk.size > 1 }           # ranges vs singles
    singles = singles.flatten
    char_class = [
      negate ? "^" : "",
      singles.empty? ? "" : "\\u{%s}" % singles.join(" "),
      ranges.map {|r| "\\u{%s}-\\u{%s}" % [r.first, r.last] }.join,
      codepoints.any?(SURROGATES_RANGE) ? "\\p{Cs}" : "", # not necessary :)
    ].join
    Regexp.new "[#{char_class}]"
  end

  # (Re)builds the assigned regexps: handwritten regexps where a Unicode
  # property matches the table, table-generated regexps everywhere else.
  def asgn_regexps!
    @asgn_regexps = {}
    # preset the regexp for each table
    asgn_regex "A.1", /\p{^AGE=3.2}/
    # If ruby supported all unicode properties (i.e. line break = word joiner):
    #   /[\u{00ad 034f 1806}\p{join_c}\p{VS}\p{lb=WJ}&&\p{age=3.2}]/
    asgn_table "B.1"
    asgn_table "B.2"
    asgn_table "B.3"
    asgn_regex "C.1.1", / /
    asgn_regex "C.1.2", /[\u200b\p{Zs}&&[^ ]]/
    asgn_regex "C.2.1", /[\x00-\x1f\x7f]/
    # C.2.2 is a union:
    #   Cc + Cf (as defined by Unicode 3.2) + Zl + Zp + 0xfffc
    #   - any codepoints covered by C.2.1 or C.8 or C.9
    #
    # But modern Unicode properties are significantly different, so it's better
    # to just load the table definition.
    asgn_table "C.2.2"
    asgn_regex "C.3", /\p{private use}/
    asgn_regex "C.4", /\p{noncharacter code point}/
    asgn_regex "C.5", /\p{surrogate}/
    asgn_regex "C.6", /[\p{in specials}&&\p{AGE=3.2}&&\p{^NChar}]/
    asgn_regex "C.7", /[\p{in ideographic description characters}&&\p{AGE=3.2}]/
    # C.8 is a union of \p{Bidi Control} and Unicode 3.2 properties. But those
    # properties have changed for modern Unicode, and thus for modern ruby's
    # regexp character properties. It's better to just load the table
    # definition.
    asgn_table "C.8"
    asgn_regex "C.9", /[\p{in Tags}&&\p{AGE=3.2}]/
    # Unfortunately, ruby doesn't (currently) support /[\p{Bidi
    # Class=R}\p{bc=AL}]/. On the other hand, StringPrep (based on Unicode 3.2)
    # might not be a good match for the modern (14.0) property value anyway.
    asgn_table "D.1"
    asgn_table "D.1", negate: true # used by BIDI_FAILS_REQ3
    asgn_table "D.2"
    @asgn_regexps
  end

  # Returns the source text of a frozen Regexp literal for the named table(s).
  def regex_str(*names, negate: false)
    "%p.freeze" % regexp_for(*names, negate: negate)
  end

  # Assigns the table's regexp (see #regexp_for) and returns the source text
  # of its constant assignment.
  def asgn_table(name, negate: false)
    asgn_regex(name, regexp_for(name, negate: negate), negate: negate)
  end

  # Returns the rdoc comment text for a table constant.
  def regexp_const_desc(name, negate: false)
    if negate then "Matches the negation of the %s table" % [name]
    else %q{%s \\StringPrep\\[\\"%s\\"]} % [titles.fetch(name), name]
    end
  end

  # Returns the constant name for a table, e.g. "C.1.2" => "IN_C_1_2".
  def regexp_const_name(table_name, negate: false)
    "IN_%s%s" % [table_name.tr(".", "_"), negate ? "_NEGATED" : ""]
  end

  # Records +regexp+ as the regexp for the table and returns the source text
  # of its documented constant assignment. The second line carries its own
  # four-space indent, because only the first line is indented by the heredoc
  # that interpolates it.
  def asgn_regex(name, regexp, negate: false)
    asgn_regexps[[name, negate]] = regexp
    "# %s\n%s%s = %p.freeze" % [
      regexp_const_desc(name, negate: negate), " " * 4,
      regexp_const_name(name, negate: negate),
      regexp,
    ]
  end

  def bidi_R_AL     ; regexp_for "D.1"               end
  def bidi_not_R_AL ; regexp_for "D.1", negate: true end
  def bidi_L        ; regexp_for "D.2"               end

  # Matches strings which contain both RandALCat and LCat characters.
  #
  # The groups are named so that the second alternative (and
  # #bidi_failure_regexp) can reuse them as \g<name> subexpression calls.
  def bidi_fails_req2
    / # RandALCat followed by LCat
      (?<r_and_al_cat>#{bidi_R_AL.source})
      .*?
      (?<l_cat>#{bidi_L.source})
    | # RandALCat preceded by LCat
      \g<l_cat> .*? \g<r_and_al_cat>
    /mux
  end

  # Matches strings which contain a RandALCat character but which don't both
  # start and end with RandALCat characters.
  def bidi_fails_req3
    / # contains RandALCat but doesn't start with RandALCat
      \A(?<not_r_nor_al>#{bidi_not_R_AL})
      .*?
      (?<r_and_al_cat>#{bidi_R_AL})
    | # contains RandALCat but doesn't end with RandALCat
      \g<r_and_al_cat> .*? \g<not_r_nor_al>\z
    /mux
  end

  # Matches strings which fail either bidirectional requirement.
  #
  # shares the bidi_R_AL definition between both req2 and req3
  def bidi_failure_regexp
    # req2 already defines the r_and_al_cat group, so req3's definition is
    # replaced by a call to it. A block is used so that the replacement text
    # is inserted literally, backslash included.
    req3_with_backref = bidi_fails_req3.source
      .gsub(%r{\(\?<r_and_al_cat>\(.*?\)\)}) { '\g<r_and_al_cat>' }
      .then {|re| "(?mx-i:#{re})" }
    /#{bidi_fails_req2} | #{req3_with_backref}/mux
  end

  # Returns a header comment for the bidirectional checks.
  def bidi_consts
    <<~RUBY
      #############
      # Bidirectional checks.
      #

    RUBY
  end

end
# ---- test/net/imap/test_saslprep.rb ----
# frozen_string_literal: true

require "net/imap"
require "test/unit"

# Checks Net::IMAP.saslprep against the RFC-4013 examples and a few extras.
class SASLprepTest < Test::Unit::TestCase
  include Net::IMAP::SASL

  # Test cases from RFC-4013 §3:
  #
  #     #  Input            Output     Comments
  #     -  -----            ------     --------
  #     1  I<U+00AD>X       IX         SOFT HYPHEN mapped to nothing
  #     2  user             user       no transformation
  #     3  USER             USER       case preserved, will not match #2
  #     4  <U+00AA>         a          output is NFKC, input in ISO 8859-1
  #     5  <U+2168>         IX         output is NFKC, will match #1
  def test_saslprep_valid_inputs
    {
      "I\u00ADX"     => "IX",      # SOFT HYPHEN mapped to nothing
      "user"         => "user",    # no transformation
      "USER"         => "USER",    # case preserved, will not match #2
      "\u00aa"       => "a",       # output is NFKC, input in ISO 8859-1
      "\u2168"       => "IX",      # output is NFKC, will match #1
      # some more tests:
      "foo\u00a0bar" => "foo bar", # map to space
      "foo\u2000bar" => "foo bar", # map to space
      "foo\u3000bar" => "foo bar", # map to space
      "\u0627"       => "\u0627",  # single RandALCat char is okay
      "\u{1f468}\u200d\u{1f469}\u200d\u{1f467}" =>
        "\u{1f468}\u{1f469}\u{1f467}" # map ZWJ to nothing
    }.each do |input, output|
      assert_equal output, Net::IMAP.saslprep(input)
    end
  end

  # Test cases from RFC-4013 §3:
  #
  #     #  Input               Output     Comments
  #     -  -----               ------     --------
  #     6  <U+0007>                       Error - prohibited character
  #     7  <U+0627><U+0031>               Error - bidirectional check
  #
  # Each input must return nil by default, and raise the listed error (with a
  # matching message) when called with <tt>exception: true</tt>.
  def test_saslprep_invalid_inputs
    {
      # from the RFC examples table
      "\u0007"          => [ProhibitedCodepoint, /ASCII control character/],
      "\u0627\u0031"    => [BidiStringError, /must start.*end with RandAL/],
      # some more prohibited codepoints
      "\x7f"            => [ProhibitedCodepoint, /ASCII control character/i],
      "\ufff9"          => [ProhibitedCodepoint, /Non-ASCII control character/i],
      "\ue000"          => [ProhibitedCodepoint, /private use.*C.3/i],
      "\u{f0000}"       => [ProhibitedCodepoint, /private use.*C.3/i],
      "\u{100000}"      => [ProhibitedCodepoint, /private use.*C.3/i],
      "\ufffe"          => [ProhibitedCodepoint, /Non-character code point.*C.4/i],
      "\xed\xa0\x80"    => [StringPrepError, /invalid byte seq\w+ in UTF-8/i],
      "\ufffd"          => [ProhibitedCodepoint, /inapprop.* plain text.*C.6/i],
      "\u2FFb"          => [ProhibitedCodepoint, /inapprop.* canonical rep.*C.7/i],
      "\u202c"          => [ProhibitedCodepoint, /change display.*deprecate.*C.8/i],
      "\u{e0001}"       => [ProhibitedCodepoint, /tagging character/i],
      # some more invalid bidirectional characters
      "\u0627abc\u0627" => [BidiStringError, /must not contain.* Lcat/i],
      "\u0627123"       => [BidiStringError, /must start.*end with RandAL/i],
    }.each do |input, (err, msg)|
      assert_nil Net::IMAP.saslprep input
      assert_raise_with_message err, msg do
        Net::IMAP.saslprep(input, exception: true)
      end
    end
  end

end

# ---- test/net/imap/test_stringprep.rb ----
# frozen_string_literal: true

require "net/imap"
require "test/unit"
require "json"
require "set"

require_relative "../../../rakelib/string_prep_tables_generator"

# Checks every library table regexp against the RFC-3454 tables, for every
# valid codepoint.
class StringPrepTest < Test::Unit::TestCase
  include Net::IMAP::SASL

  # Surrogates are excluded. They are handled by enforcing valid UTF8 encoding.
  #
  # They are skipped by range (rather than by rescuing errors from Integer#chr)
  # so that any other failure isn't silently swallowed.
  VALID_CODEPOINTS = (0..0x10_ffff)
    .reject {|cp| StringPrepTablesGenerator::SURROGATES_RANGE.cover?(cp) }
    .map {|cp| cp.chr("UTF-8") }

  rfc3454_generator = StringPrepTablesGenerator.new

  # testing with set inclusion, just in case the regexp generation is buggy
  RFC3454_TABLE_SETS = rfc3454_generator.sets

  # The library regexps are a mixture of generated vs handcrafted, in order to
  # reduce load-time and memory footprint of the largest tables.
  #
  # These are the simple generated regexps, which directly translate each table
  # into a character class with every codepoint. These can be used to verify
  # the hand-crafted regexps are correct, for every supported version of ruby.
  RFC3454_TABLE_REGEXPS = rfc3454_generator.regexps

  # C.5 (surrogates) aren't really tested here.
  # D.2 includes surrogates... which also aren't tested here.
  #
  # This is ok: valid UTF-8 encoding is enforced and cannot contain surrogates.

  def test_rfc3454_table_A_1;   assert_rfc3454_table_compliance "A.1"   end
  def test_rfc3454_table_B_1;   assert_rfc3454_table_compliance "B.1"   end
  def test_rfc3454_table_B_2;   assert_rfc3454_table_compliance "B.2"   end
  def test_rfc3454_table_B_3;   assert_rfc3454_table_compliance "B.3"   end
  def test_rfc3454_table_C_1_1; assert_rfc3454_table_compliance "C.1.1" end
  def test_rfc3454_table_C_1_2; assert_rfc3454_table_compliance "C.1.2" end
  def test_rfc3454_table_C_2_1; assert_rfc3454_table_compliance "C.2.1" end
  def test_rfc3454_table_C_2_2; assert_rfc3454_table_compliance "C.2.2" end
  def test_rfc3454_table_C_3;   assert_rfc3454_table_compliance "C.3"   end
  def test_rfc3454_table_C_4;   assert_rfc3454_table_compliance "C.4"   end
  def test_rfc3454_table_C_5;   assert_rfc3454_table_compliance "C.5"   end
  def test_rfc3454_table_C_6;   assert_rfc3454_table_compliance "C.6"   end
  def test_rfc3454_table_C_7;   assert_rfc3454_table_compliance "C.7"   end
  def test_rfc3454_table_C_8;   assert_rfc3454_table_compliance "C.8"   end
  def test_rfc3454_table_C_9;   assert_rfc3454_table_compliance "C.9"   end
  def test_rfc3454_table_D_1;   assert_rfc3454_table_compliance "D.1"   end
  def test_rfc3454_table_D_2;   assert_rfc3454_table_compliance "D.2"   end

  # Asserts that the library's regexp for the named table matches exactly the
  # same valid codepoints as the table-generated regexp and the table's set.
  def assert_rfc3454_table_compliance(name)
    set     = RFC3454_TABLE_SETS        .fetch(name)
    regexp  = RFC3454_TABLE_REGEXPS     .fetch(name)
    coded   = StringPrep::TABLE_REGEXPS.fetch(name)
    matched_set = VALID_CODEPOINTS
      .map {|cp| cp.unpack1 "U" }
      .grep(set)
      .map {|cp| [cp].pack "U" }
    matched_reg = VALID_CODEPOINTS.grep(regexp)
    matched_lib = VALID_CODEPOINTS.grep(coded)
    # C.5 only holds surrogates, which VALID_CODEPOINTS leaves out
    assert_not_empty matched_lib unless name == "C.5"
    # assert_equal freezes up on errors; too much data to pretty print.
    # and printing out weird unicode control characters (etc) isn't very useful.
    if matched_lib != matched_reg
      missing = (matched_reg - matched_lib).map {|s| "%04x" % s.codepoints.first }
      extra   = (matched_lib - matched_reg).map {|s| "%04x" % s.codepoints.first }
      assert_empty missing.first(100), "missing some codepoints"
      assert_empty extra.first(100),   "some extra codepoints"
    else
      assert_equal matched_lib, matched_reg
      assert_equal matched_lib, matched_set
    end
  end

end