Canonicalize tlangs to lowercase (#4134)

unicode-org · Oct 14, 2023 · c8f45ed · c8f45ed
1 parent 871c238
commit c8f45ed
Show file tree

Hide file tree

Showing 12 changed files with 143 additions and 25 deletions.
diff --git a/components/locid/src/extensions/mod.rs b/components/locid/src/extensions/mod.rs
@@ -23,7 +23,7 @@
 //! use icu::locid::extensions::unicode::{Key, Value};
 //! use icu::locid::Locale;
 //!
-//! let loc: Locale = "en-US-u-ca-buddhist-t-en-US-h0-hybrid-x-foo"
+//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
 //!     .parse()
 //!     .expect("Failed to parse.");
 //!

diff --git a/components/locid/src/extensions/transform/mod.rs b/components/locid/src/extensions/transform/mod.rs
@@ -16,7 +16,7 @@
 //! use icu::locid::{LanguageIdentifier, Locale};
 //!
 //! let mut loc: Locale =
-//!     "en-US-t-es-AR-h0-hybrid".parse().expect("Parsing failed.");
+//!     "en-US-t-es-ar-h0-hybrid".parse().expect("Parsing failed.");
 //!
 //! let lang: LanguageIdentifier =
 //!     "es-AR".parse().expect("Parsing LanguageIdentifier failed.");
@@ -28,7 +28,7 @@
 //! assert!(loc.extensions.transform.fields.contains_key(&key));
 //! assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value));
 //!
-//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-AR-h0-hybrid");
+//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-ar-h0-hybrid");
 //! ```
 mod fields;
 mod key;
@@ -60,7 +60,7 @@ use litemap::LiteMap;
 /// use icu::locid::{LanguageIdentifier, Locale};
 ///
 /// let mut loc: Locale =
-///     "de-t-en-US-h0-hybrid".parse().expect("Parsing failed.");
+///     "de-t-en-us-h0-hybrid".parse().expect("Parsing failed.");
 ///
 /// let en_us: LanguageIdentifier = "en-US".parse().expect("Parsing failed.");
 ///
@@ -107,7 +107,7 @@ impl Transform {
     /// ```
     /// use icu::locid::Locale;
     ///
-    /// let mut loc: Locale = "en-US-t-es-AR".parse().expect("Parsing failed.");
+    /// let mut loc: Locale = "en-US-t-es-ar".parse().expect("Parsing failed.");
     ///
     /// assert!(!loc.extensions.transform.is_empty());
     /// ```
@@ -122,7 +122,7 @@ impl Transform {
     /// ```
     /// use icu::locid::Locale;
     ///
-    /// let mut loc: Locale = "en-US-t-es-AR".parse().unwrap();
+    /// let mut loc: Locale = "en-US-t-es-ar".parse().unwrap();
     /// loc.extensions.transform.clear();
     /// assert_eq!(loc, "en-US".parse().unwrap());
     /// ```
@@ -196,7 +196,7 @@ impl Transform {
         }
         f("t")?;
         if let Some(lang) = &self.lang {
-            lang.for_each_subtag_str(f)?;
+            lang.for_each_subtag_str_lowercased(f)?;
         }
         self.fields.for_each_subtag_str(f)
     }
@@ -212,7 +212,7 @@ impl writeable::Writeable for Transform {
         sink.write_str("t")?;
         if let Some(lang) = &self.lang {
             sink.write_char('-')?;
-            writeable::Writeable::write_to(lang, sink)?;
+            lang.write_lowercased_to(sink)?;
         }
         if !self.fields.is_empty() {
             sink.write_char('-')?;

diff --git a/components/locid/src/langid.rs b/components/locid/src/langid.rs
@@ -327,6 +327,74 @@ impl LanguageIdentifier {
         }
         Ok(())
     }
+
+    /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
+    /// lowercase ASCII form.
+    ///
+    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
+    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
+    ///
+    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
+    /// > ordered by the separators, alphabetically._
+    ///
+    /// Hence, this method is used inside [`Transform extensions`] to produce the correct
+    /// canonicalization of the language identifier.
+    ///
+    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
+    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
+    /// but titlecased and uppercased outside T extensions respectively.
+    ///
+    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
+    /// [`Transform extensions`]: crate::extensions::transform
+    pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
+    where
+        F: FnMut(&str) -> Result<(), E>,
+    {
+        f(self.language.as_str())?;
+        if let Some(ref script) = self.script {
+            f(script.into_tinystr().to_ascii_lowercase().as_str())?;
+        }
+        if let Some(ref region) = self.region {
+            f(region.into_tinystr().to_ascii_lowercase().as_str())?;
+        }
+        for variant in self.variants.iter() {
+            f(variant.as_str())?;
+        }
+        Ok(())
+    }
+
+    /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ASCII characters with
+    /// lowercase ASCII characters.
+    ///
+    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
+    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
+    ///
+    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
+    /// > ordered by the separators, alphabetically._
+    ///
+    /// Hence, this method is used inside [`Transform extensions`] to produce the correct
+    /// canonicalization of the language identifier.
+    ///
+    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
+    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
+    /// but titlecased and uppercased outside T extensions respectively.
+    ///
+    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
+    /// [`Transform extensions`]: crate::extensions::transform
+    pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
+        &self,
+        sink: &mut W,
+    ) -> core::fmt::Result {
+        let mut initial = true;
+        self.for_each_subtag_str_lowercased(&mut |subtag| {
+            if initial {
+                initial = false;
+            } else {
+                sink.write_char('-')?;
+            }
+            sink.write_str(subtag)
+        })
+    }
 }
 
 impl AsRef<LanguageIdentifier> for LanguageIdentifier {

diff --git a/components/locid/tests/fixtures/canonicalize.json b/components/locid/tests/fixtures/canonicalize.json
@@ -14,5 +14,55 @@
   {
     "input": "en-scouse-fonipa",
     "output": "en-fonipa-scouse"
+  },
+  {
+    "input": {
+      "type": "Locale",
+      "identifier": "en-US-t-es-AR-x-foo"
+    },
+    "output": {
+      "type": "Locale",
+      "identifier": "en-US-t-es-ar-x-foo"
+    }
+  },
+  {
+    "input": {
+      "type": "Locale",
+      "identifier": "en-t-en-Latn-CA-emodeng"
+    },
+    "output": {
+      "type": "Locale",
+      "identifier": "en-t-en-latn-ca-emodeng"
+    }
+  },
+  {
+    "input": {
+      "type": "Locale",
+      "identifier": "EN-US-T-ES-AR-X-FOO"
+    },
+    "output": {
+      "type": "Locale",
+      "identifier": "en-US-t-es-ar-x-foo"
+    }
+  },
+  {
+    "input": {
+      "type": "Locale",
+      "identifier": "EN-T-EN-LATN-CA-EMODENG"
+    },
+    "output": {
+      "type": "Locale",
+      "identifier": "en-t-en-latn-ca-emodeng"
+    }
+  },
+  {
+    "input": {
+      "type": "Locale",
+      "identifier": "UND-CYRL-T-ES-LATN-M0-UNGEGN"
+    },
+    "output": {
+      "type": "Locale",
+      "identifier": "und-Cyrl-t-es-latn-m0-ungegn"
+    }
   }
 ]
diff --git a/components/locid/tests/fixtures/locale.json b/components/locid/tests/fixtures/locale.json
@@ -75,7 +75,7 @@
   {
     "input": {
       "type": "Locale",
-      "identifier": "en-US-t-pl-Latn-DE"
+      "identifier": "en-US-t-pl-latn-de"
     },
     "output": {
       "type": "Locale",
@@ -125,7 +125,7 @@
   {
     "input": {
       "type": "Locale",
-      "identifier": "en-US-t-es-AR-x-foo"
+      "identifier": "en-US-t-es-ar-x-foo"
     },
     "output": {
       "type": "Locale",
@@ -142,7 +142,7 @@
   {
     "input": {
       "type": "Locale",
-      "identifier": "en-US-u-ca-buddhist-hc-h12-t-es-AR-h0-hybrid-x-private-foobar"
+      "identifier": "en-US-u-ca-buddhist-hc-h12-t-es-ar-h0-hybrid-x-private-foobar"
     },
     "output": {
       "type": "Locale",

diff --git a/components/locid/tests/locale.rs b/components/locid/tests/locale.rs
@@ -72,10 +72,10 @@ fn test_locale_conversions() {
 
 #[test]
 fn test_locale_canonicalize() {
-    let locale: Locale = "En-latn-US-MacOS"
-        .parse()
-        .expect("Failed to parse a locale.");
-    assert_writeable_eq!(locale, Locale::canonicalize("eN-latN-uS-macOS").unwrap());
+    let path = "./tests/fixtures/canonicalize.json";
+    let data = helpers::read_fixture(path).expect("Failed to read a fixture");
+
+    test_langid_fixtures(data);
 }
 
 #[test]

diff --git a/experimental/transliterate/tests/data/baked/macros/transliterator_rules_v1.data.rs b/experimental/transliterate/tests/data/baked/macros/transliterator_rules_v1.data.rs
diff --git a/provider/datagen/tests/data/json/transliterator/rules@1/und+de-t-de-d0-ascii.json b/provider/datagen/tests/data/json/transliterator/rules@1/und+de-t-de-d0-ascii.json
diff --git a/...ator/rules@1/und+und-Arab-t-und-Beng.json → ...ator/rules@1/und+und-Arab-t-und-beng.json b/...ator/rules@1/und+und-Arab-t-und-Beng.json → ...ator/rules@1/und+und-Arab-t-und-beng.json
diff --git a/...les@1/und+und-t-und-Beng-d0-intindic.json → ...les@1/und+und-t-und-beng-d0-intindic.json b/...les@1/und+und-t-und-Beng-d0-intindic.json → ...les@1/und+und-t-und-beng-d0-intindic.json
diff --git a/.../rules@1/und+und-t-und-Latn-d0-ascii.json → .../rules@1/und+und-t-und-latn-d0-ascii.json b/.../rules@1/und+und-t-und-Latn-d0-ascii.json → .../rules@1/und+und-t-und-latn-d0-ascii.json
diff --git a/provider/datagen/tests/data/postcard/fingerprints.csv b/provider/datagen/tests/data/postcard/fingerprints.csv
@@ -2063,20 +2063,20 @@ time_zone/specific_short@1, sr-Latn, 119B, dcdb9855b7df4f90
 time_zone/specific_short@1, th, 31B, 4b7af6a019fab889
 time_zone/specific_short@1, tr, 31B, 4b7af6a019fab889
 time_zone/specific_short@1, und, 31B, 4b7af6a019fab889
-transliterator/rules@1, und+de-t-de-d0-ascii, 16754B, 373dc989a6f2feb0
+transliterator/rules@1, und+de-t-de-d0-ascii, 16754B, 7504fd6441311f8b
 transliterator/rules@1, und+el-Latn-t-el-m0-bgn, 13802B, 676fd7d03e5f65ba
 transliterator/rules@1, und+und-Arab-t-s0-intindic, 24093B, bb464298570e790f
-transliterator/rules@1, und+und-Arab-t-und-Beng, 320B, 6910c1371f9ed898
+transliterator/rules@1, und+und-Arab-t-und-beng, 320B, 740b227a94b146ff
 transliterator/rules@1, und+und-Latn-t-s0-ascii, 109B, c4235cb150e12966
 transliterator/rules@1, und+und-t-d0-publish, 3476B, 8b78371a1427663b
 transliterator/rules@1, und+und-t-s0-publish, 1342B, fc819d57a6653613
-transliterator/rules@1, und+und-t-und-Beng-d0-intindic, 2620B, 5d7d726babccafe7
-transliterator/rules@1, und+und-t-und-Latn-d0-ascii, 27083B, 5098d1af741181a3
+transliterator/rules@1, und+und-t-und-beng-d0-intindic, 2620B, 5d7d726babccafe7
 transliterator/rules@1, und+und-t-und-d0-test-m0-cursfilt-s0-test, 92B, a3f0d5ed65cba360
 transliterator/rules@1, und+und-t-und-d0-test-m0-emtymach-s0-test, 104B, fd8a17f7ffe5a325
 transliterator/rules@1, und+und-t-und-d0-test-m0-hexrust-s0-test, 77B, bd697bfcd06ad4ca
 transliterator/rules@1, und+und-t-und-d0-test-m0-hexuni-s0-test, 80B, 55d96425b75e5ac8
 transliterator/rules@1, und+und-t-und-d0-test-m0-niels-s0-test, 1769B, 45400449cf43ecf6
 transliterator/rules@1, und+und-t-und-d0-test-m0-rectesta-s0-test, 369B, 69c41d4b5c828833
 transliterator/rules@1, und+und-t-und-d0-test-m0-rectestr-s0-test, 237B, 3345ed066cbb729f
+transliterator/rules@1, und+und-t-und-latn-d0-ascii, 27083B, 5098d1af741181a3
 units/constants@1, und, 426B, e0c7eeb9e702371c