Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Number formatting for many languages and sorting (collation) with ICU #1632

Merged
merged 5 commits into from
Nov 30, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 189 additions & 0 deletions core/utilities-numbers.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
--
-- Number formatting utilities
-- MIT License (c) 2022 SILE organization
--
local icu = require("justenoughicu")

-- Registry of number formatting hooks, keyed by language code and then by
-- hook name. Language modules add their own entries to this table
-- (see e.g. languages/eo.lua); "und" holds the language-independent ones.
local formatNumber = {
  und = {

    -- "alpha" is really a numbering system rather than a formatting style,
    -- but it is registered here so it can be dispatched like any other hook.
    -- Bijective base-26 using lowercase letters: 1 -> "a", 26 -> "z",
    -- 27 -> "aa", and so on.
    alpha = function (num)
      local base = string.byte("a")
      local letters = ""
      repeat
        -- Shift to a zero-based value so that the remainder maps to a..z.
        num = num - 1
        local digit = num % 26
        letters = string.char(digit + base) .. letters
        num = (num - digit) / 26
      until num < 1
      return letters
    end

  }
}

-- Format style names accepted in options.style, mapped to the numeric value
-- of the matching ICU constant (a useful subset of those found in unum.h).
local icuStyles = {
  ["default"] = 0, -- UNUM_PATTERN_DECIMAL
  ["decimal"] = 1, -- UNUM_DECIMAL
  ["string"] = 5, -- UNUM_SPELLOUT
  ["ordinal"] = 6, -- UNUM_ORDINAL
}

-- Numbering systems known to have no rule for the default(0) format style
-- in ICU, i.e. for which ICU would just output latin digits.
-- This is purely an optimization: it spares a second ICU call for systems
-- that are likely to be requested often, such as "roman".
local icuStyleBypass = {
  ["roman"] = true,
}

--- Format a number through ICU.
-- The options table is only read, never modified: callers may reuse it, and
-- the casing of options.system is meaningful to them.
-- @tparam number num Number to format.
-- @tparam[opt] string lang ICU locale, e.g. "en", "en-US", "sr-Latn".
-- @tparam table options Reads `style` (a key of icuStyles, "default" when
--   absent) and `system` (a numbering system specifier, case-insensitive).
-- @treturn string The formatted number, or tostring(num) when ICU fails or
--   when neither a language nor a numbering system is given.
local icuFormat = function (num, lang, options)
  -- Consistency: further below we'll concatenate those, and an empty
  -- string is likely a user mistake.
  if not lang and not options.system then
    SU.warn("Number formatting needs a language or a numbering system")
    -- Every other exit path returns a string, so do the same here.
    return tostring(num)
  end

  -- ICU format style (enum)
  local icustyle = icuStyles[options.style or "default"]
  if not icustyle then
    SU.warn("Number formatting style is unrecognized (using default as fallback)")
    icustyle = 0
  end

  -- ICU locale: see https://unicode-org.github.io/icu/userguide/locale/
  -- Ex. "en", "en-US", "sr-Latn"...
  local iculocale = lang or ""
  -- ICU keyword for a numbering system specifier: @numbers=xxxx
  -- The specifiers are defined here:
  -- https://github.com/unicode-org/cldr/blob/main/common/bcp47/number.xml
  local system = options.system
  if system then
    system = system:lower()
    iculocale = iculocale .. "@numbers=" .. system
    if icuStyleBypass[system] then
      icustyle = 1
    end
  end

  local ok, result = pcall(icu.format_number, num, iculocale, icustyle)
  if ok and system and icustyle == 0 and system ~= "latn" and result == tostring(num) then
    -- There are valid cases where "@numbers=xxxx" with default(0) and decimal(1) styles both work.
    -- Typically, with num=1234
    --   "@numbers=arab" in default(0) --> ١٢٣٤
    --   "@numbers=arab" in decimal(1) --> ١٬٢٣٤
    -- But in many cases, ICU may fall back to latin, e.g. take "roman" (or "grek")
    --   "@numbers=roman" in default(0) --> 1234
    --   "@numbers=roman" in decimal(1) --> MCCXXXIV
    -- Be user friendly and attempt honoring the script.
    ok, result = pcall(icu.format_number, num, "@numbers=" .. system, 1)
  end
  if not ok then
    SU.warn("Number formatting failed: " .. tostring(result))
  end
  return tostring(ok and result or num)
end

-- Guess a casing from the spelling of a name: "roman" gives "lower",
-- "Roman" gives "title", anything else (e.g. "ROMAN") gives "upper".
local function guessCase (name)
  if name:match("^%l") then
    return "lower"
  elseif name:match("^.%l") then
    return "title"
  end
  return "upper"
end

setmetatable (formatNumber, {
  --- Format a number according to options, and an optional case.
  -- @tparam number num Number to format.
  -- @tparam[opt] table|string options Options table (a string is still
  --   accepted through the deprecated compatibility shim below):
  --   - system: a numbering system string, e.g. "latn" (= "arabic"), "roman", "arab", etc.
  --     With the addition of "alpha".
  --     Casing is guessed from the system (e.g. roman, Roman, ROMAN) unless specified
  --   - style: a format style string, i.e. "default", "decimal", "ordinal", "string"
  --     E.g. in English and latin script: 1234  1,234  1,234th  one thousand...
  --     Possibly extended by additional language-specific formatting rules.
  --   Obviously, some combinations of system, style and case won't do anything useful.
  -- @tparam[opt] string case Casing passed on to icu.case; guessed when absent.
  -- @return The result of icu.case applied to the formatted number.
  __call = function (self, num, options, case)
    if math.abs(num) > 9223372036854775807 then
      SU.warn("Integers larger than 64 bits do not reproduce properly in all formats")
    end
    options = options or {}

    -- BEGIN COMPATIBILITY SHIM
    if type(options) ~= "table" then
      -- It used to be a string aggregating both concepts.
      SU.deprecated("Previous syntax of SU.formatNumber", "new syntax for SU.formatNumber", "0.14.6", "0.16.0", [[
Previous syntax was SU.formatNumber(num, format[, case]) with a format string
New syntax is SU.formatNumber(num, options[, case]) with an options table,
possibly containing:
- system: a numbering system string, e.g. "latn" (= "arabic"), "roman", "arab", etc.
With the addition of "alpha".
Casing is taken into account (e.g. roman, Roman, ROMAN) unless specified
- style: a format style string, i.e. "default", "decimal", "ordinal", "string")
E.g. in English and latin script: 1234 1,234 1,124th one thousand...
Possibly extended by additional language-specific formatting rules.
Note that the new syntax doesn't handle casing on the format style, for separation of
concerns.
]])
      case = case or guessCase(options)
      local format = options:lower()
      if format == "nth" then
        SU.deprecated("Format 'nth' in SU.formatNumber", "'ordinal' in SU.formatNumber", "0.14.6", "0.16.0")
        options = { style = "ordinal" }
      elseif format == "string" then
        options = { style = "string" }
      elseif format == "ordinal" and SILE.settings:get("document.language") == "tr" then
        SU.deprecated("Format 'ordinal' in Turkish in SU.formatNumber", "'ordinal-string' in SU.formatNumber", "0.14.6", "0.16.0")
        options = { style = "ordinal-string" }
      else
        options = { system = options }
      end
    end
    -- END COMPATIBILITY SHIM

    -- Work on a shallow copy from here on: the options are normalized in
    -- place below (and possibly further down the call chain), and the
    -- caller's table must stay untouched since it may be reused, with the
    -- casing of its system field being significant.
    local opts = {}
    for key, value in pairs(options) do
      opts[key] = value
    end
    options = opts

    if options.system == "arabic" then
      -- "arabic" is the weirdly named, but quite friendly, system used e.g. in
      -- counters and in several other places, let's keep it as a compatibility alias.
      options.system = "latn"
    end

    local system = options.system
    if not case then
      case = system and guessCase(system) or "lower"
    end
    system = system and system:lower()

    -- Roman numerals are always formatted (and cased) as Latin.
    local lang = system == "roman" and "la" or SILE.settings:get("document.language")
    local style = options.style
    local result
    if self[lang] and style and type(self[lang][style]) == "function" then
      -- Language specific hooks exists, use them...
      result = self[lang][style](num, options)
    elseif style and type(self["und"][style]) == "function" then
      -- Global specific hooks exists: use them...
      -- (The hook is looked up by style, exactly as tested above.)
      result = self.und[style](num, options)
    elseif system and type(self["und"][system]) == "function" then
      -- TRICK: Notably, special case for "alpha"
      result = self.und[system](num, options)
    else
      --- Otherwise, rely on ICU...
      result = icuFormat(num, lang, options)
    end
    return icu.case(result, lang, case)
  end
})

return formatNumber
36 changes: 36 additions & 0 deletions core/utilities-sorting.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
--
-- Table sorting with language-dependent collation
-- MIT License (c) 2022 SILE organization
--
local icu = require("justenoughicu")

-- Per-language overrides for collated sorting, keyed by language code.
local collatedSort = {
  -- ICU has nothing to offer for the undetermined language "und":
  -- use the 'natural' ordering of table.sort instead (options are ignored).
  und = function (t, _)
    return table.sort(t)
  end
}

setmetatable (collatedSort, {
  --- Sort a table in place using the collation rules of the document language.
  -- @tparam table t Table to sort.
  -- @tparam[opt] table options Collation options handed over to ICU; when
  --   absent, the defaults registered for the language (if any) are used.
  -- @return Whatever the language override returns when there is one,
  --   nothing otherwise.
  __call = function (self, t, options)
    local lang = SILE.settings:get("document.language")
    local override = self[lang]
    if type(override) == "function" then
      -- Allow overriding ICU for some languages, typically "und"
      return override(t, options)
    end

    if type(override) == "table" then
      -- Allow customizing the default collation options for some languages
      options = options or override
    end
    -- Be efficient: create the collator once before sorting.
    -- I don't think we need to cache it, still.
    local collator = icu.collation_create(lang, options or {})
    -- Sort in protected mode so that the collator is released even when the
    -- sort or the comparison raises.
    local ok, err = pcall(table.sort, t, function (s1, s2)
      return icu.compare(collator, s1, s2)
    end)
    icu.collation_destroy(collator)
    if not ok then
      -- Level 0: re-raise the original error without adding position info.
      error(err, 0)
    end
  end
})

return collatedSort
58 changes: 4 additions & 54 deletions core/utilities.lua
Original file line number Diff line number Diff line change
Expand Up @@ -521,60 +521,6 @@ end
utilities.utf16be_to_utf8 = function (str) return utf16_to_utf8(str, "be") end
utilities.utf16le_to_utf8 = function (str) return utf16_to_utf8(str, "le") end

local icu = require("justenoughicu")

-- Format num with ICU using the given format; whenever ICU raises or gives
-- nothing usable back, fall back to the plain string form of the number.
local icuFormat = function (num, format)
  local succeeded, formatted = pcall(icu.format_number, num, format)
  if succeeded and formatted then
    return tostring(formatted)
  end
  return tostring(num)
end

-- Registry of number formatters, keyed by language code and then by format
-- name. Language modules add their own functions to this table
-- (see e.g. languages/tr.lua); "und" holds the language-independent ones.
utilities.formatNumber = {
  und = {

    -- Bijective base-26 using lowercase letters: 1 -> "a", 26 -> "z",
    -- 27 -> "aa", and so on.
    alpha = function (num)
      local base = string.byte("a")
      local letters = ""
      repeat
        -- Shift to a zero-based value so that the remainder maps to a..z.
        num = num - 1
        local digit = num % 26
        letters = string.char(digit + base) .. letters
        num = (num - digit) / 26
      until num < 1
      return letters
    end

  }
}

setmetatable (utilities.formatNumber, {
  -- Format num according to a format name, applying the given case or, when
  -- none is given, the one guessed from the spelling of the format name
  -- (e.g. roman, Roman, ROMAN).
  __call = function (self, num, format, case)
    if math.abs(num) > 9223372036854775807 then
      SU.warn("Integers larger than 64 bits do not reproduce properly in all formats")
    end

    if not case then
      if format:match("^%l") then
        case = "lower"
      else
        case = format:match("^.%l") and "title" or "upper"
      end
    end

    -- Any format mentioning "roman", whatever its casing, is handled as Latin.
    local lang
    if format:match("[Rr][Oo][Mm][Aa][Nn]") then
      lang = "la"
    else
      lang = SILE.settings:get("document.language")
    end

    -- Formatters are registered under lowercase names.
    local key = format:lower()
    local formatter
    if self[lang] and type(self[lang][key]) == "function" then
      formatter = self[lang][key]
    elseif type(self["und"][key]) == "function" then
      formatter = self.und[key]
    end

    local result
    if formatter then
      result = formatter(num)
    else
      -- No formatter registered: let ICU deal with it.
      result = icuFormat(num, key)
    end
    return icu.case(result, lang, case)
  end
})

utilities.breadcrumbs = function ()
local breadcrumbs = {}

Expand Down Expand Up @@ -609,4 +555,8 @@ utilities.breadcrumbs = function ()
return breadcrumbs
end

utilities.formatNumber = require("core.utilities-numbers")

utilities.collatedSort = require("core.utilities-sorting")

return utilities
10 changes: 5 additions & 5 deletions languages/en.lua
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ SILE.hyphenator.languages["en"].exceptions = {"as-so-ciate", "as-so-ciates",
"ref-or-ma-tion", "ret-ri-bu-tion", "ta-ble"}

-- Internationalisation stuff
local en_nth = function (num)
local en_ordinal = function (num)
local mod100, mod10 = num % 100, num % 10
if mod100 > 3 and mod100 < 21 then return "th" end
if mod10 == 1 then return "st" end
Expand Down Expand Up @@ -644,14 +644,14 @@ local en_string = function (num)
vword = vword:gsub("ten " .. v, twords[i])
end

return num == 0 and "zero" or vword
return num == 0 and "zero" or vword:sub(1, -2)
end

SU.formatNumber.en = {
string = function (num)
string = function (num, _)
return en_string(num)
end,
nth = function (num)
return num .. '’' .. en_nth(num)
ordinal = function (num, _)
return num .. '’' .. en_ordinal(num)
end
}
8 changes: 4 additions & 4 deletions languages/eo.lua

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions languages/tr.lua
Original file line number Diff line number Diff line change
Expand Up @@ -756,13 +756,13 @@ local tr_nums = function (num, ordinal)
end

SU.formatNumber.tr = {
string = function (num)
string = function (num, _)
return tr_nums(num, false)
end,
ordinal = function (num)
['ordinal-string'] = function (num, _)
return tr_nums(num, true)
end,
nth = function (num)
ordinal = function (num, _)
return num .. "."
end
}
4 changes: 2 additions & 2 deletions packages/counters/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ local function getMultilevelCounter (_, id)
end

function package.formatCounter (_, counter)
return SU.formatNumber(counter.value, counter.display)
return SU.formatNumber(counter.value, { system = counter.display })
end

function package:formatMultilevelCounter (counter, options)
Expand Down Expand Up @@ -219,7 +219,7 @@ The available built-in display types are:
\item{\code{alpha}, for lower-case alphabetic counting;}
\item{\code{Alpha}, for upper-case alphabetic counting;}
\item{\code{roman}, for lower-case Roman numerals; and,}
\item{\code{Roman} for upper-case Roman numerals.}
\item{\code{ROMAN} for upper-case Roman numerals.}
\end{itemize}

The ICU library also provides ways of formatting numbers in global (non-Latin) scripts.
Expand Down
Loading