Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSL support #2082

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,460 changes: 1,460 additions & 0 deletions csl/core/engine.lua

Large diffs are not rendered by default.

246 changes: 246 additions & 0 deletions csl/core/locale.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
--- Reader for CSL 1.0.2 locale files
--
-- @copyright License: MIT (c) 2024 Omikhleia
--
-- Public API:
-- - (static method) CslLocale.parse(doc) -> CslLocale
-- - (static method) CslLocale.read(filename) -> CslLocale
-- - CslLocale:date(form) -> table<cs:date-parts>
-- - CslLocale:term(name, form?, plural?) -> string, gender
-- - CslLocale:ordinal(number, form?, gender-form?, plural?) -> string
-- - CslLocale:case(text, textCase) -> string
--

local casing = require("csl.core.utils.casing")

local xmlparser = require("csl.core.utils.xmlparser")
local parse = xmlparser.parse
-- Options handed to the XML parser when a locale document is parsed
-- (passed as second argument to xmlparser.parse in CslLocale.parse).
-- NOTE(review): the precise semantics of each option are defined by
-- csl.core.utils.xmlparser, which is not visible here -- confirm there.
-- The "cs:" prefix is consistent with the node commands matched in this
-- file ("cs:terms", "cs:term", "cs:date", ...).
local rules = {
prefix = "cs:",
skipEmptyStrings = true,
preserveEmptyStrings = {},
stripSpaces = true,
preserveSpaces = { text = true, title = true, id = true, term = true },
}

-- Class table for locale objects, created with the Penlight class helper
-- (`pl` is expected to be available as a global).
local CslLocale = pl.class()

-- Constructor: set up empty containers, then populate them from the
-- parsed locale syntax tree.
-- @tparam table tree Root node of the parsed locale document
function CslLocale:_init (tree)
   -- Style options default to an empty table; replaced if the locale
   -- provides a cs:style-options element.
   self.styleOptions = {}
   -- Date formats, indexed by form.
   self.dates = {}
   -- Terms, indexed by name, then form, then gender-form.
   self.terms = {}
   self:_preprocess(tree)
end

-- Walk the top-level nodes of the locale syntax tree and index them into
-- lookup-friendly maps: terms (name -> form -> gender-form), dates (form)
-- and the locale style options.
function CslLocale:_preprocess (tree)
   self.lang = tree.options["xml:lang"]

   for _, node in ipairs(tree) do
      local command = node.command
      if command == "cs:terms" then
         for _, entry in ipairs(node) do
            if entry.command == "cs:term" then
               local attrs = entry.options
               local name = attrs.name
               if not name then
                  SU.error("CSL locale term without name")
               end
               local form = attrs.form or "long"
               -- Only ordinal terms actually use gender-form, but storing
               -- it for every term keeps one uniform internal layout.
               local genderf = attrs["gender-form"] or "neuter"

               local byForm = self.terms[name]
               if not byForm then
                  byForm = {}
                  self.terms[name] = byForm
               end
               local byGender = byForm[form]
               if not byGender then
                  byGender = {}
                  byForm[form] = byGender
               end
               -- Keep the whole term node (not just its content) so that
               -- its attributes remain accessible later.
               byGender[genderf] = entry
            end
         end
      elseif command == "cs:style-options" then
         self.styleOptions = node.options
      elseif command == "cs:date" then
         local form = node.options.form
         if not form then
            SU.error("CSL locale date without form")
         end
         -- Only the cs:date-part children are kept for a date.
         self.dates[form] = SU.ast.subContent(node)
      end
   end
end

-- Return the value held by a term-like node, i.e. its first child.
-- @tparam table term A term node (or a cs:single / cs:multiple sub-node)
function CslLocale:_termvalue (term) -- luacheck: no unused args
   local value = term[1]
   return value
end

-- Fallback chain between term forms:
--   verb-short -> verb -> long, and symbol -> short -> long.
-- Forms absent from this map ("long", or any unrecognized value) have
-- no fallback.
local termFormFallbacks = {
   ["verb-short"] = "verb",
   ["symbol"] = "short",
   ["verb"] = "long",
   ["short"] = "long",
}

-- Find the term node registered for a name, form and gender-form.
-- When the requested form is missing, the form fallback chain is followed;
-- when the requested gender-form is missing, "neuter" is tried instead.
-- @tparam string name The name of the term
-- @tparam string form The form of the term (default: "long")
-- @tparam string genderf The gender-form of the term (default: "neuter")
-- @treturn table|nil The whole term node, or nil if nothing matches
function CslLocale:_lookupTerm (name, form, genderf)
   local forms = self.terms[name]
   if not forms then
      return nil
   end
   form = form or "long"
   local genders = forms[form]
   if not genders then
      local fallback = termFormFallbacks[form]
      if not fallback then
         -- Either "long" (end of the chain) or an unknown form. Previously
         -- an unknown form was retried unchanged, and the tail call then
         -- looped forever.
         return nil
      end
      return self:_lookupTerm(name, fallback, genderf)
   end
   genderf = genderf or "neuter"
   local found = genders[genderf]
   if not found then
      if genderf == "neuter" then
         return nil -- (No fallback)
      end
      return self:_lookupTerm(name, form, "neuter")
   end
   SU.debug("csl", "Lookup term", name, form, genderf)
   return found
end

-- Find the term providing the short ordinal suffix for a number.
-- Lookup order:
--   1. the exact "ordinal-NN" term for the number (0-99);
--   2. for 10-99, the "ordinal-0N" term of the last digit, unless that
--      term is restricted with match="whole-number";
--   3. the generic "ordinal" term.
-- @tparam number|string number The value to find an ordinal suffix for
-- @tparam string genderf The gender-form (default handled by _lookupTerm)
-- @treturn table|nil The matching term node, or nil if the locale has none
function CslLocale:_lookupShortOrdinal (number, genderf)
   SU.debug("csl", "Lookup short-ordinal", number, genderf)
   number = tonumber(number)
   if not number then
      SU.error("CSL ordinal term requires a number")
   end
   if number >= 100 then
      -- TODO FIXME: CSL specs do define rules for larger numbers, but is this really useful?
      -- Not bothering for now!
      SU.error("CSL ordinal beyond currently supported range")
   end

   -- Direct match on the number itself (0-99)
   local exact = self:_lookupTerm(("ordinal-%02d"):format(number), "long", genderf)
   if exact then
      return exact
   end

   if number >= 10 then
      -- No direct match, try to match the last digit
      local lastDigit = number % 10
      local candidate = self:_lookupTerm(("ordinal-%02d"):format(lastDigit), "long", genderf)
      if candidate then
         -- Term attributes are stored in the node's options (like name,
         -- form and gender), so the match attribute must be read there.
         -- Reading it directly on the node always gave nil, which made
         -- whole-number terms wrongly apply to e.g. 21, 31...
         local matching = candidate.options and candidate.options.match
         if matching ~= "whole-number" then
            return candidate
         end
      end
   end

   return self:_lookupTerm("ordinal", "long", genderf)
end

-- PUBLIC METHODS

--- Retrieve a localized date format.
-- Reports through SU.error when the locale has no date for that form.
-- @tparam string form The form of the date ('numeric' or 'text')
-- @treturn table The cs:date-part nodes stored for that form
function CslLocale:date (form)
   local parts = self.dates[form]
   if parts then
      return parts
   end
   SU.error("CSL locale date format not found: " .. tostring(form))
   return parts
end
Omikhleia marked this conversation as resolved.
Show resolved Hide resolved

--- Lookup a term in the locale.
-- Reserved for non-ordinal terms.
-- A term either holds its text directly, or holds cs:single / cs:multiple
-- sub-elements from which the requested variant is picked.
-- @tparam string name The name of the term
-- @tparam string form The form of the term (default: "long")
-- @tparam boolean plural Whether to return the plural form (default: false)
-- @treturn string|nil The term text, or nil when the term is not defined
-- @treturn string|nil The gender attribute of the term, if any
function CslLocale:term (name, form, plural)
   local term = self:_lookupTerm(name, form)
   if not term then
      return nil
   end
   if type(term[1]) == "string" then
      -- Simple term: the text is the direct content
      return self:_termvalue(term), term.options.gender
   end
   local sgpl = SU.ast.findInTree(term, plural and "cs:multiple" or "cs:single")
   if not sgpl then
      -- (A leftover debugging dump of the term to stdout was removed here.)
      return SU.error("CSL term error for singular/multiple: " .. name)
   end
   return self:_termvalue(sgpl), term.options.gender
end

--- Format a number with its ordinal suffix from the locale.
-- Reserved for ordinal terms.
-- Only short ordinals are implemented: requesting the "long" form emits a
-- warning and falls back to short ordinals.
-- @tparam number number The numeric value to be formatted
-- @tparam string form The form of the term (default: "short")
-- @tparam string genderf The gender-form of the term (default: "neuter")
-- @tparam boolean plural Whether to return the plural form (default: false)
-- @treturn string The number followed by its ordinal suffix
function CslLocale:ordinal (number, form, genderf, plural)
   if form == "long" then
      -- TODO FIXME: Not sure this is widely used, not bothering for now
      SU.warn("CSL long-ordinal term not implemented, fallback to short ordinals")
   end
   local term = self:_lookupShortOrdinal(number, genderf)
   if not term then
      SU.error("CSL ordinal term not found for ordinal: " .. tostring(number))
   end
   if type(term[1]) == "string" then
      -- Simple term: the suffix is the direct content
      return number .. self:_termvalue(term)
   end
   -- The plural variant of a CSL term is the cs:multiple element (as in
   -- CslLocale:term); "cs:plural" does not exist and could never match.
   local sgpl = SU.ast.findInTree(term, plural and "cs:multiple" or "cs:single")
   if not sgpl then
      SU.error("CSL ordinal term not found for ordinal: " .. tostring(number))
   end
   return number .. self:_termvalue(sgpl)
end

--- Apply a text case transformation.
-- The transformation is looked up by name in the casing module and is
-- applied with the language of this locale. An unknown (or missing)
-- transformation only triggers a warning and leaves the text unchanged.
-- @tparam string text Text to transform
-- @tparam string textCase CSL case transformation
-- @treturn string The transformed text
function CslLocale:case (text, textCase)
   local transform = casing[textCase]
   if not transform then
      -- tostring() so that a nil textCase yields the warning rather than
      -- a "concatenate a nil value" error (same approach as in date()).
      SU.warn("CSL locale case not found: " .. tostring(textCase))
      return text
   end
   return transform(text, self.lang)
end

--- Build a locale object from the content of a CSL locale file (static method).
-- @tparam string doc The CSL locale file content
-- @treturn CslLocale The locale object (or nil, error message on failure)
function CslLocale.parse (doc)
   local ast, reason = parse(doc, rules)
   if ast then
      return CslLocale(ast)
   end
   -- Propagate the parser failure to the caller
   return nil, reason
end

--- Load and parse a CSL locale file (static method).
-- @tparam string filename The resolved filename of the locale file
-- @treturn CslLocale The locale object (or nil, error message on failure)
function CslLocale.read (filename)
   local handle, reason = io.open(filename)
   if handle then
      -- Slurp the whole file, then release the handle before parsing
      local content = handle:read("*a")
      handle:close()
      return CslLocale.parse(content)
   end
   return nil, reason
end

-- Module exports: the locale class is exposed under the CslLocale key.
return {
CslLocale = CslLocale,
}
100 changes: 100 additions & 0 deletions csl/core/style.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
--- Reader for CSL 1.0.2 style files
--
-- @copyright License: MIT (c) 2024 Omikhleia
--
-- Public API:
-- - (static method) CslStyle.parse(doc) -> CslStyle
-- - (static method) CslStyle.read(filename) -> CslStyle
--

local xmlparser = require("csl.core.utils.xmlparser")
local parse = xmlparser.parse
-- Options handed to the XML parser when a style document is parsed
-- (passed as second argument to xmlparser.parse in CslStyle.parse).
-- NOTE(review): the precise semantics of each option are defined by
-- csl.core.utils.xmlparser, which is not visible here -- confirm there.
-- The "cs:" prefix is consistent with the node commands matched in this
-- file ("cs:macro", "cs:locale", "cs:citation", "cs:bibliography").
local rules = {
prefix = "cs:",
skipEmptyStrings = true,
preserveEmptyStrings = {},
stripSpaces = true,
preserveSpaces = { text = true, title = true, id = true, term = true },
}

-- Class table for style objects, created with the Penlight class helper
-- (`pl` is expected to be available as a global).
local CslStyle = pl.class()

-- Constructor: set up empty containers, then populate them from the
-- parsed style syntax tree.
-- @tparam table tree Root node of the parsed style document
function CslStyle:_init (tree)
   -- Global options default to an empty table (replaced by _preprocess)
   self.globalOptions = {}
   -- Whole cs:citation and cs:bibliography nodes, when the style has them
   self.citation = nil
   self.bibliography = nil
   -- Locale overrides by language, and macro contents by name
   self.locales = {}
   self.macros = {}
   self:_preprocess(tree)
end

-- Walk the top-level nodes of the style syntax tree and index them:
-- macros by name, locale overrides by language, plus the citation and
-- bibliography nodes. Duplicate definitions trigger a warning and the
-- last one wins.
function CslStyle:_preprocess (tree)
   -- Global options and inheritable name options
   self.globalOptions = tree.options

   for _, node in ipairs(tree) do
      local command = node.command
      local attrs = node.options
      if command == "cs:macro" then
         local name = attrs and attrs.name
         if not name then
            SU.error("CSL macro without name")
         end
         if self.macros[name] then
            SU.warn("CSL macro " .. name .. " has multiple definitions, using the last one")
         end
         -- Only the content of the macro is needed
         self.macros[name] = SU.ast.subContent(node)
      elseif command == "cs:locale" then
         local lang = attrs and attrs["xml:lang"]
         if not lang then
            SU.error("CSL locale without xml:lang")
         end
         if self.locales[lang] then
            SU.warn("CSL locale " .. lang .. " has multiple definitions, using the last one")
         end
         -- Whole node kept: these are full locales (overrides)
         self.locales[lang] = node
      elseif command == "cs:citation" then
         if self.citation then
            SU.warn("CSL has multiple citation definitions, using the last one")
         end
         -- Whole node kept, for the citation options (attributes)
         self.citation = node
      elseif command == "cs:bibliography" then
         if self.bibliography then
            SU.warn("CSL has multiple bibliography definitions, using the last one")
         end
         -- Whole node kept, for the bibliography options (attributes)
         self.bibliography = node
      end
      -- cs:info is ignored, and no other top-level element is expected
   end
end

--- Build a style object from the content of a CSL style document (static method).
-- @tparam string doc The CSL style document
-- @treturn CslStyle The parsed CSL style object (or nil, error message on failure)
function CslStyle.parse (doc)
   local ast, reason = parse(doc, rules)
   if ast then
      return CslStyle(ast)
   end
   -- Propagate the parser failure to the caller
   return nil, reason
end

--- Load and parse a CSL style file (static method).
-- @tparam string filename The resolved filename of the CSL style file
-- @treturn CslStyle The parsed CSL style object (or nil, error message on failure)
function CslStyle.read (filename)
   local handle, reason = io.open(filename)
   if handle then
      -- Slurp the whole file, then release the handle before parsing
      local content = handle:read("*a")
      handle:close()
      return CslStyle.parse(content)
   end
   return nil, reason
end

-- Module exports: the style class is exposed under the CslStyle key.
return {
CslStyle = CslStyle,
}
45 changes: 45 additions & 0 deletions csl/core/utils/casing.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
--- Casing functions for CSL locales.
--
-- @copyright License: MIT (c) 2024 Omikhleia
--
-- Objectives: provide functions to handle text casing in CSL locales.
--

local icu = require("justenoughicu")
-- N.B. We don't use the textcase package here:
-- The language is a BCP47 identifier from the CSL locale.

-- Uppercase the first UTF-8 character of a text (via ICU, for the given
-- language) and leave the remainder untouched.
local function capitalizeFirst (text, lang)
   local head = icu.case(luautf8.sub(text, 1, 1), lang, "upper")
   local tail = luautf8.sub(text, 2)
   return head .. tail
end

-- Whole-text lowercasing through ICU, for the given language.
local function toLowercase (text, lang)
   return icu.case(text, lang, "lower")
end

-- Whole-text uppercasing through ICU, for the given language.
local function toUppercase (text, lang)
   return icu.case(text, lang, "upper")
end

-- Sentence case is deprecated: warn and hand the text back unchanged.
local function toSentence (text, _)
   SU.warn("Sentence case is deprecated in CSL 1.0.x (ignored)")
   return text
end

--- Text casing methods for CSL, indexed by CSL text-case name.
-- @table casing methods for lower, upper, capitalize-first, capitalize-all, title, sentence
local casing = {}

-- Straightforward ones
casing["lowercase"] = toLowercase
casing["uppercase"] = toUppercase
casing["capitalize-first"] = capitalizeFirst

-- Opinionated choice: even ICU does not handle these well, lacking good
-- support for exceptions (small words, prepositions, articles...) in most
-- languages. Both therefore fall back to capitalize-first.
casing["capitalize-all"] = capitalizeFirst
casing["title"] = capitalizeFirst

-- Deprecated, not worth bothering with.
casing["sentence"] = toSentence

return casing
Loading
Loading