Document new methods and rename

unicode-org · Oct 9, 2023 · 4a6c072 · 4a6c072
1 parent ebeda52
commit 4a6c072
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 5 deletions.
diff --git a/components/locid/src/extensions/transform/mod.rs b/components/locid/src/extensions/transform/mod.rs
@@ -196,7 +196,7 @@ impl Transform {
         }
         f("t")?;
         if let Some(lang) = &self.lang {
-            lang.for_each_subtag_str_lowercase(f)?;
+            lang.for_each_subtag_str_lowercased(f)?;
         }
         self.fields.for_each_subtag_str(f)
     }
@@ -212,7 +212,7 @@ impl writeable::Writeable for Transform {
         sink.write_str("t")?;
         if let Some(lang) = &self.lang {
             sink.write_char('-')?;
-            lang.write_to_lowercase(sink)?;
+            lang.write_lowercased_to(sink)?;
         }
         if !self.fields.is_empty() {
             sink.write_char('-')?;

diff --git a/components/locid/src/langid.rs b/components/locid/src/langid.rs
@@ -328,7 +328,25 @@ impl LanguageIdentifier {
         Ok(())
     }
 
-    pub(crate) fn for_each_subtag_str_lowercase<E, F>(&self, f: &mut F) -> Result<(), E>
+    /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
+    /// lowercase ASCII form.
+    ///
+    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
+    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
+    ///
+    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
+    /// > ordered by the separators, alphabetically._
+    ///
+    /// Hence, this method is used inside [`Transform extensions`] to produce the correct
+    /// canonicalization of the language identifier.
+    ///
+    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
+    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
+    /// but titlecased and uppercased outside T extensions respectively.
+    ///
+    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
+    /// [`Transform extensions`]: crate::extensions::transform
+    pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
     where
         F: FnMut(&str) -> Result<(), E>,
     {
@@ -345,12 +363,30 @@ impl LanguageIdentifier {
         Ok(())
     }
 
-    pub(crate) fn write_to_lowercase<W: core::fmt::Write + ?Sized>(
+    /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ASCII characters with
+    /// their lowercase ASCII equivalents.
+    ///
+    /// The default canonicalization of language identifiers uses titlecase scripts and uppercase
+    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
+    ///
+    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
+    /// > ordered by the separators, alphabetically._
+    ///
+    /// Hence, this method is used inside [`Transform extensions`] to produce the correct
+    /// canonicalization of the language identifier.
+    ///
+    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
+    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
+    /// but titlecased and uppercased outside T extensions respectively.
+    ///
+    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
+    /// [`Transform extensions`]: crate::extensions::transform
+    pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
         &self,
         sink: &mut W,
     ) -> core::fmt::Result {
         let mut initial = true;
-        self.for_each_subtag_str_lowercase(&mut |subtag| {
+        self.for_each_subtag_str_lowercased(&mut |subtag| {
             if initial {
                 initial = false;
             } else {