Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Canonicalize tlangs to lowercase #4134

Merged
merged 4 commits into from
Oct 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/locid/src/extensions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
//! use icu::locid::extensions::unicode::{Key, Value};
//! use icu::locid::Locale;
//!
//! let loc: Locale = "en-US-u-ca-buddhist-t-en-US-h0-hybrid-x-foo"
//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
//! .parse()
//! .expect("Failed to parse.");
//!
Expand Down
14 changes: 7 additions & 7 deletions components/locid/src/extensions/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
//! use icu::locid::{LanguageIdentifier, Locale};
//!
//! let mut loc: Locale =
//! "en-US-t-es-AR-h0-hybrid".parse().expect("Parsing failed.");
//! "en-US-t-es-ar-h0-hybrid".parse().expect("Parsing failed.");
//!
//! let lang: LanguageIdentifier =
//! "es-AR".parse().expect("Parsing LanguageIdentifier failed.");
Expand All @@ -28,7 +28,7 @@
//! assert!(loc.extensions.transform.fields.contains_key(&key));
//! assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value));
//!
//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-AR-h0-hybrid");
//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-ar-h0-hybrid");
//! ```
mod fields;
mod key;
Expand Down Expand Up @@ -60,7 +60,7 @@ use litemap::LiteMap;
/// use icu::locid::{LanguageIdentifier, Locale};
///
/// let mut loc: Locale =
/// "de-t-en-US-h0-hybrid".parse().expect("Parsing failed.");
/// "de-t-en-us-h0-hybrid".parse().expect("Parsing failed.");
///
/// let en_us: LanguageIdentifier = "en-US".parse().expect("Parsing failed.");
///
Expand Down Expand Up @@ -107,7 +107,7 @@ impl Transform {
/// ```
/// use icu::locid::Locale;
///
/// let mut loc: Locale = "en-US-t-es-AR".parse().expect("Parsing failed.");
/// let mut loc: Locale = "en-US-t-es-ar".parse().expect("Parsing failed.");
///
/// assert!(!loc.extensions.transform.is_empty());
/// ```
Expand All @@ -122,7 +122,7 @@ impl Transform {
/// ```
/// use icu::locid::Locale;
///
/// let mut loc: Locale = "en-US-t-es-AR".parse().unwrap();
/// let mut loc: Locale = "en-US-t-es-ar".parse().unwrap();
/// loc.extensions.transform.clear();
/// assert_eq!(loc, "en-US".parse().unwrap());
/// ```
Expand Down Expand Up @@ -196,7 +196,7 @@ impl Transform {
}
f("t")?;
if let Some(lang) = &self.lang {
lang.for_each_subtag_str(f)?;
lang.for_each_subtag_str_lowercased(f)?;
}
self.fields.for_each_subtag_str(f)
}
Expand All @@ -212,7 +212,7 @@ impl writeable::Writeable for Transform {
sink.write_str("t")?;
if let Some(lang) = &self.lang {
sink.write_char('-')?;
writeable::Writeable::write_to(lang, sink)?;
lang.write_lowercased_to(sink)?;
}
if !self.fields.is_empty() {
sink.write_char('-')?;
Expand Down
68 changes: 68 additions & 0 deletions components/locid/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,74 @@ impl LanguageIdentifier {
}
Ok(())
}

/// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
/// lowercase ascii form.
///
/// The default canonicalization of language identifiers uses titlecase scripts and uppercase
/// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// ordered by the separators, alphabetically._
///
/// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
/// canonicalization of the language identifier.
///
/// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
/// but titlecased and uppercased outside T extensions respectively.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.into_tinystr().to_ascii_lowercase().as_str())?;
}
if let Some(ref region) = self.region {
f(region.into_tinystr().to_ascii_lowercase().as_str())?;
}
for variant in self.variants.iter() {
f(variant.as_str())?;
}
Ok(())
}

/// Serializes this `LanguageIdentifier` into `sink` using only lowercase ASCII subtags.
///
/// The subtags produced by `for_each_subtag_str_lowercased` are written in order,
/// joined by a single `-`; no separator is emitted before the first subtag. Any error
/// reported by `sink` is returned immediately.
///
/// Language identifiers normally canonicalize to a titlecase script and an uppercase
/// region. [RFC6497 (BCP 47 Extension T)] asks for something different:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// ordered by the separators, alphabetically._
///
/// [`Transform extensions`] therefore use this method to obtain the correct
/// canonicalization of their embedded language identifier.
///
/// For example, the canonical form of the locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**: script and region are lowercased inside the T extension,
/// but titlecased and uppercased (respectively) outside of it.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
    &self,
    sink: &mut W,
) -> core::fmt::Result {
    // Becomes true once the first subtag has been written, so that every
    // later subtag is preceded by a '-'.
    let mut needs_separator = false;
    self.for_each_subtag_str_lowercased(&mut |subtag| {
        if needs_separator {
            sink.write_char('-')?;
        }
        needs_separator = true;
        sink.write_str(subtag)
    })
}
}

impl AsRef<LanguageIdentifier> for LanguageIdentifier {
Expand Down
50 changes: 50 additions & 0 deletions components/locid/tests/fixtures/canonicalize.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,55 @@
{
"input": "en-scouse-fonipa",
"output": "en-fonipa-scouse"
},
{
"input": {
"type": "Locale",
"identifier": "en-US-t-es-AR-x-foo"
},
"output": {
"type": "Locale",
"identifier": "en-US-t-es-ar-x-foo"
}
},
{
"input": {
"type": "Locale",
"identifier": "en-t-en-Latn-CA-emodeng"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please test uppercase variants

},
"output": {
"type": "Locale",
"identifier": "en-t-en-latn-ca-emodeng"
}
},
{
"input": {
"type": "Locale",
"identifier": "EN-US-T-ES-AR-X-FOO"
},
"output": {
"type": "Locale",
"identifier": "en-US-t-es-ar-x-foo"
}
},
{
"input": {
"type": "Locale",
"identifier": "EN-T-EN-LATN-CA-EMODENG"
},
"output": {
"type": "Locale",
"identifier": "en-t-en-latn-ca-emodeng"
}
},
{
"input": {
"type": "Locale",
"identifier": "UND-CYRL-T-ES-LATN-M0-UNGEGN"
},
"output": {
"type": "Locale",
"identifier": "und-Cyrl-t-es-latn-m0-ungegn"
}
}
]
6 changes: 3 additions & 3 deletions components/locid/tests/fixtures/locale.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
{
"input": {
"type": "Locale",
"identifier": "en-US-t-pl-Latn-DE"
"identifier": "en-US-t-pl-latn-de"
},
"output": {
"type": "Locale",
Expand Down Expand Up @@ -125,7 +125,7 @@
{
"input": {
"type": "Locale",
"identifier": "en-US-t-es-AR-x-foo"
"identifier": "en-US-t-es-ar-x-foo"
},
"output": {
"type": "Locale",
Expand All @@ -142,7 +142,7 @@
{
"input": {
"type": "Locale",
"identifier": "en-US-u-ca-buddhist-hc-h12-t-es-AR-h0-hybrid-x-private-foobar"
"identifier": "en-US-u-ca-buddhist-hc-h12-t-es-ar-h0-hybrid-x-private-foobar"
},
"output": {
"type": "Locale",
Expand Down
8 changes: 4 additions & 4 deletions components/locid/tests/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ fn test_locale_conversions() {

#[test]
fn test_locale_canonicalize() {
let locale: Locale = "En-latn-US-MacOS"
.parse()
.expect("Failed to parse a locale.");
assert_writeable_eq!(locale, Locale::canonicalize("eN-latN-uS-macOS").unwrap());
let path = "./tests/fixtures/canonicalize.json";
let data = helpers::read_fixture(path).expect("Failed to read a fixture");

test_langid_fixtures(data);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for turning this into a data-driven test!

}

#[test]
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions provider/datagen/tests/data/postcard/fingerprints.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2063,20 +2063,20 @@ time_zone/specific_short@1, sr-Latn, 119B, dcdb9855b7df4f90
time_zone/specific_short@1, th, 31B, 4b7af6a019fab889
time_zone/specific_short@1, tr, 31B, 4b7af6a019fab889
time_zone/specific_short@1, und, 31B, 4b7af6a019fab889
transliterator/rules@1, und+de-t-de-d0-ascii, 16754B, 373dc989a6f2feb0
transliterator/rules@1, und+de-t-de-d0-ascii, 16754B, 7504fd6441311f8b
transliterator/rules@1, und+el-Latn-t-el-m0-bgn, 13802B, 676fd7d03e5f65ba
transliterator/rules@1, und+und-Arab-t-s0-intindic, 24093B, bb464298570e790f
transliterator/rules@1, und+und-Arab-t-und-Beng, 320B, 6910c1371f9ed898
transliterator/rules@1, und+und-Arab-t-und-beng, 320B, 740b227a94b146ff
transliterator/rules@1, und+und-Latn-t-s0-ascii, 109B, c4235cb150e12966
transliterator/rules@1, und+und-t-d0-publish, 3476B, 8b78371a1427663b
transliterator/rules@1, und+und-t-s0-publish, 1342B, fc819d57a6653613
transliterator/rules@1, und+und-t-und-Beng-d0-intindic, 2620B, 5d7d726babccafe7
transliterator/rules@1, und+und-t-und-Latn-d0-ascii, 27083B, 5098d1af741181a3
transliterator/rules@1, und+und-t-und-beng-d0-intindic, 2620B, 5d7d726babccafe7
transliterator/rules@1, und+und-t-und-d0-test-m0-cursfilt-s0-test, 92B, a3f0d5ed65cba360
transliterator/rules@1, und+und-t-und-d0-test-m0-emtymach-s0-test, 104B, fd8a17f7ffe5a325
transliterator/rules@1, und+und-t-und-d0-test-m0-hexrust-s0-test, 77B, bd697bfcd06ad4ca
transliterator/rules@1, und+und-t-und-d0-test-m0-hexuni-s0-test, 80B, 55d96425b75e5ac8
transliterator/rules@1, und+und-t-und-d0-test-m0-niels-s0-test, 1769B, 45400449cf43ecf6
transliterator/rules@1, und+und-t-und-d0-test-m0-rectesta-s0-test, 369B, 69c41d4b5c828833
transliterator/rules@1, und+und-t-und-d0-test-m0-rectestr-s0-test, 237B, 3345ed066cbb729f
transliterator/rules@1, und+und-t-und-latn-d0-ascii, 27083B, 5098d1af741181a3
units/constants@1, und, 426B, e0c7eeb9e702371c
Loading