Skip to content

Commit

Permalink
Canonicalize tlangs to lowercase (#4134)
Browse files Browse the repository at this point in the history
  • Loading branch information
jedel1043 authored Oct 14, 2023
1 parent 871c238 commit c8f45ed
Show file tree
Hide file tree
Showing 12 changed files with 143 additions and 25 deletions.
2 changes: 1 addition & 1 deletion components/locid/src/extensions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
//! use icu::locid::extensions::unicode::{Key, Value};
//! use icu::locid::Locale;
//!
//! let loc: Locale = "en-US-u-ca-buddhist-t-en-US-h0-hybrid-x-foo"
//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
//! .parse()
//! .expect("Failed to parse.");
//!
Expand Down
14 changes: 7 additions & 7 deletions components/locid/src/extensions/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
//! use icu::locid::{LanguageIdentifier, Locale};
//!
//! let mut loc: Locale =
//! "en-US-t-es-AR-h0-hybrid".parse().expect("Parsing failed.");
//! "en-US-t-es-ar-h0-hybrid".parse().expect("Parsing failed.");
//!
//! let lang: LanguageIdentifier =
//! "es-AR".parse().expect("Parsing LanguageIdentifier failed.");
Expand All @@ -28,7 +28,7 @@
//! assert!(loc.extensions.transform.fields.contains_key(&key));
//! assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value));
//!
//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-AR-h0-hybrid");
//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-ar-h0-hybrid");
//! ```
mod fields;
mod key;
Expand Down Expand Up @@ -60,7 +60,7 @@ use litemap::LiteMap;
/// use icu::locid::{LanguageIdentifier, Locale};
///
/// let mut loc: Locale =
/// "de-t-en-US-h0-hybrid".parse().expect("Parsing failed.");
/// "de-t-en-us-h0-hybrid".parse().expect("Parsing failed.");
///
/// let en_us: LanguageIdentifier = "en-US".parse().expect("Parsing failed.");
///
Expand Down Expand Up @@ -107,7 +107,7 @@ impl Transform {
/// ```
/// use icu::locid::Locale;
///
/// let mut loc: Locale = "en-US-t-es-AR".parse().expect("Parsing failed.");
/// let mut loc: Locale = "en-US-t-es-ar".parse().expect("Parsing failed.");
///
/// assert!(!loc.extensions.transform.is_empty());
/// ```
Expand All @@ -122,7 +122,7 @@ impl Transform {
/// ```
/// use icu::locid::Locale;
///
/// let mut loc: Locale = "en-US-t-es-AR".parse().unwrap();
/// let mut loc: Locale = "en-US-t-es-ar".parse().unwrap();
/// loc.extensions.transform.clear();
/// assert_eq!(loc, "en-US".parse().unwrap());
/// ```
Expand Down Expand Up @@ -196,7 +196,7 @@ impl Transform {
}
f("t")?;
if let Some(lang) = &self.lang {
lang.for_each_subtag_str(f)?;
lang.for_each_subtag_str_lowercased(f)?;
}
self.fields.for_each_subtag_str(f)
}
Expand All @@ -212,7 +212,7 @@ impl writeable::Writeable for Transform {
sink.write_str("t")?;
if let Some(lang) = &self.lang {
sink.write_char('-')?;
writeable::Writeable::write_to(lang, sink)?;
lang.write_lowercased_to(sink)?;
}
if !self.fields.is_empty() {
sink.write_char('-')?;
Expand Down
68 changes: 68 additions & 0 deletions components/locid/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,74 @@ impl LanguageIdentifier {
}
Ok(())
}

/// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
/// lowercase ascii form.
///
/// The default canonicalization of language identifiers uses titlecase scripts and uppercase
/// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// ordered by the separators, alphabetically._
///
/// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
/// canonicalization of the language identifier.
///
/// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
/// but titlecased and uppercased outside T extensions respectively.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.into_tinystr().to_ascii_lowercase().as_str())?;
}
if let Some(ref region) = self.region {
f(region.into_tinystr().to_ascii_lowercase().as_str())?;
}
for variant in self.variants.iter() {
f(variant.as_str())?;
}
Ok(())
}

/// Writes this `LanguageIdentifier` to `sink` as hyphen-separated subtags, using the
/// lowercased subtag strings produced by `for_each_subtag_str_lowercased`.
///
/// A `-` is emitted before every subtag except the first one, so the output carries neither a
/// leading nor a trailing separator. Any error reported by `sink` is returned as-is.
///
/// # Why a lowercased form exists
///
/// Language identifiers normally canonicalize to a titlecase script and an uppercase region.
/// [RFC6497 (BCP 47 Extension T)] asks for something different inside the extension:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// > ordered by the separators, alphabetically._
///
/// [`Transform extensions`] therefore write their embedded language identifier with this
/// method rather than with the default `Writeable` output.
///
/// For instance, the locale **EN-LATN-CA-T-EN-LATN-CA** canonicalizes to
/// **en-Latn-CA-t-en-latn-ca**: the script and region are lowercased inside the T extension,
/// while outside of it they stay titlecased and uppercased respectively.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
    &self,
    sink: &mut W,
) -> core::fmt::Result {
    // Becomes true once the first subtag is reached; from then on every subtag is
    // preceded by a hyphen.
    let mut needs_separator = false;
    let mut emit = |subtag: &str| -> core::fmt::Result {
        if needs_separator {
            sink.write_char('-')?;
        }
        needs_separator = true;
        sink.write_str(subtag)
    };
    self.for_each_subtag_str_lowercased(&mut emit)
}
}

impl AsRef<LanguageIdentifier> for LanguageIdentifier {
Expand Down
50 changes: 50 additions & 0 deletions components/locid/tests/fixtures/canonicalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,55 @@
{
"input": "en-scouse-fonipa",
"output": "en-fonipa-scouse"
},
{
"input": {
"type": "Locale",
"identifier": "en-US-t-es-AR-x-foo"
},
"output": {
"type": "Locale",
"identifier": "en-US-t-es-ar-x-foo"
}
},
{
"input": {
"type": "Locale",
"identifier": "en-t-en-Latn-CA-emodeng"
},
"output": {
"type": "Locale",
"identifier": "en-t-en-latn-ca-emodeng"
}
},
{
"input": {
"type": "Locale",
"identifier": "EN-US-T-ES-AR-X-FOO"
},
"output": {
"type": "Locale",
"identifier": "en-US-t-es-ar-x-foo"
}
},
{
"input": {
"type": "Locale",
"identifier": "EN-T-EN-LATN-CA-EMODENG"
},
"output": {
"type": "Locale",
"identifier": "en-t-en-latn-ca-emodeng"
}
},
{
"input": {
"type": "Locale",
"identifier": "UND-CYRL-T-ES-LATN-M0-UNGEGN"
},
"output": {
"type": "Locale",
"identifier": "und-Cyrl-t-es-latn-m0-ungegn"
}
}
]
6 changes: 3 additions & 3 deletions components/locid/tests/fixtures/locale.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
{
"input": {
"type": "Locale",
"identifier": "en-US-t-pl-Latn-DE"
"identifier": "en-US-t-pl-latn-de"
},
"output": {
"type": "Locale",
Expand Down Expand Up @@ -125,7 +125,7 @@
{
"input": {
"type": "Locale",
"identifier": "en-US-t-es-AR-x-foo"
"identifier": "en-US-t-es-ar-x-foo"
},
"output": {
"type": "Locale",
Expand All @@ -142,7 +142,7 @@
{
"input": {
"type": "Locale",
"identifier": "en-US-u-ca-buddhist-hc-h12-t-es-AR-h0-hybrid-x-private-foobar"
"identifier": "en-US-u-ca-buddhist-hc-h12-t-es-ar-h0-hybrid-x-private-foobar"
},
"output": {
"type": "Locale",
Expand Down
8 changes: 4 additions & 4 deletions components/locid/tests/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ fn test_locale_conversions() {

#[test]
fn test_locale_canonicalize() {
let locale: Locale = "En-latn-US-MacOS"
.parse()
.expect("Failed to parse a locale.");
assert_writeable_eq!(locale, Locale::canonicalize("eN-latN-uS-macOS").unwrap());
let path = "./tests/fixtures/canonicalize.json";
let data = helpers::read_fixture(path).expect("Failed to read a fixture");

test_langid_fixtures(data);
}

#[test]
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions provider/datagen/tests/data/postcard/fingerprints.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2063,20 +2063,20 @@ time_zone/specific_short@1, sr-Latn, 119B, dcdb9855b7df4f90
time_zone/specific_short@1, th, 31B, 4b7af6a019fab889
time_zone/specific_short@1, tr, 31B, 4b7af6a019fab889
time_zone/specific_short@1, und, 31B, 4b7af6a019fab889
transliterator/rules@1, und+de-t-de-d0-ascii, 16754B, 373dc989a6f2feb0
transliterator/rules@1, und+de-t-de-d0-ascii, 16754B, 7504fd6441311f8b
transliterator/rules@1, und+el-Latn-t-el-m0-bgn, 13802B, 676fd7d03e5f65ba
transliterator/rules@1, und+und-Arab-t-s0-intindic, 24093B, bb464298570e790f
transliterator/rules@1, und+und-Arab-t-und-Beng, 320B, 6910c1371f9ed898
transliterator/rules@1, und+und-Arab-t-und-beng, 320B, 740b227a94b146ff
transliterator/rules@1, und+und-Latn-t-s0-ascii, 109B, c4235cb150e12966
transliterator/rules@1, und+und-t-d0-publish, 3476B, 8b78371a1427663b
transliterator/rules@1, und+und-t-s0-publish, 1342B, fc819d57a6653613
transliterator/rules@1, und+und-t-und-Beng-d0-intindic, 2620B, 5d7d726babccafe7
transliterator/rules@1, und+und-t-und-Latn-d0-ascii, 27083B, 5098d1af741181a3
transliterator/rules@1, und+und-t-und-beng-d0-intindic, 2620B, 5d7d726babccafe7
transliterator/rules@1, und+und-t-und-d0-test-m0-cursfilt-s0-test, 92B, a3f0d5ed65cba360
transliterator/rules@1, und+und-t-und-d0-test-m0-emtymach-s0-test, 104B, fd8a17f7ffe5a325
transliterator/rules@1, und+und-t-und-d0-test-m0-hexrust-s0-test, 77B, bd697bfcd06ad4ca
transliterator/rules@1, und+und-t-und-d0-test-m0-hexuni-s0-test, 80B, 55d96425b75e5ac8
transliterator/rules@1, und+und-t-und-d0-test-m0-niels-s0-test, 1769B, 45400449cf43ecf6
transliterator/rules@1, und+und-t-und-d0-test-m0-rectesta-s0-test, 369B, 69c41d4b5c828833
transliterator/rules@1, und+und-t-und-d0-test-m0-rectestr-s0-test, 237B, 3345ed066cbb729f
transliterator/rules@1, und+und-t-und-latn-d0-ascii, 27083B, 5098d1af741181a3
units/constants@1, und, 426B, e0c7eeb9e702371c

0 comments on commit c8f45ed

Please sign in to comment.