Here’s a BibTeX replacement in 400 lines of Lua.

sile-typesetter · Apr 5, 2015 · 682bbc5 · 682bbc5
1 parent 541a22d
commit 682bbc5
Show file tree

Hide file tree

Showing 3 changed files with 416 additions and 2 deletions.
diff --git a/packages/bibliography.lua b/packages/bibliography.lua
@@ -0,0 +1,392 @@
+std = require("std")
+
+--- Like string.find, but skips matches that lie behind a balanced `{...}`
+-- group.
+-- Whenever a balanced brace group opens before the current candidate match,
+-- the search restarts just after that group's closing brace. A match that
+-- begins exactly at an opening brace is accepted. Unbalanced braces are not
+-- recognised as groups and therefore hide nothing.
+-- @param s    string to search
+-- @param pat  Lua pattern
+-- @param i    optional start index (defaults to 1, as for string.find)
+-- @return start index, end index and any captures of the match, or nil
+local function find_outside_braces(s, pat, i)
+  local j, k = string.find(s, pat, i)
+  if not j then return j, k end
+  local jb, kb = string.find(s, '%b{}', i)
+  while jb and jb < j do --- scan past braces
+    --- braces come first, so we search again after close brace
+    local i2 = kb + 1
+    j, k = string.find(s, pat, i2)
+    if not j then return j, k end
+    jb, kb = string.find(s, '%b{}', i2)
+  end
+  -- either pat precedes braces or there are no braces
+  return string.find(s, pat, j) --- 2nd call needed to get captures
+end
+
+--- Split s into the substrings that lie between matches of pat.
+-- @param s     string to split
+-- @param pat   Lua pattern describing a separator
+-- @param find  optional finder with string.find's signature
+--              (for example find_outside_braces); defaults to string.find
+-- @return list of pieces; a trailing separator yields a final empty string
+local function split(s, pat, find)
+  local locate = find or string.find
+  local limit = string.len(s) + 1
+  local pieces = { }
+  local pos = 1
+  while pos <= limit do
+    local first, last = locate(s, pat, pos)
+    if not first then
+      -- no further separator: the remainder is the last piece
+      pieces[#pieces + 1] = string.sub(s, pos)
+      break
+    end
+    pieces[#pieces + 1] = string.sub(s, pos, first - 1)
+    pos = last + 1
+  end
+  return pieces
+end
+
+--- Collect the separators themselves: every substring of s matched by pat.
+-- @param s     string to scan
+-- @param pat   Lua pattern describing a separator
+-- @param find  optional finder with string.find's signature
+--              (for example find_outside_braces); defaults to string.find
+-- @return list of matched separator strings, in order of appearance
+local function splitters(s, pat, find)
+  local locate = find or string.find
+  local seps = { }
+  local from = 1
+  while true do
+    local first, last = locate(s, pat, from)
+    if not first then break end
+    seps[#seps + 1] = string.sub(s, first, last)
+    from = last + 1
+  end
+  return seps
+end
+
+--- Split a name list on the word "and" (any letter case, surrounded by
+-- whitespace, outside balanced braces).
+-- When a piece still starts with "and" followed by whitespace (two
+-- consecutive "and"s in the input), the "and" is stripped and an empty name
+-- is inserted in front of the piece.
+-- @param s  the name list string
+-- @return list of name strings
+local function namesplit(s)
+  local leading_and = '^[aA][nN][dD]%s+'
+  local names = split(s, '%s+[aA][nN][dD]%s+', find_outside_braces)
+  local idx = 2
+  while idx <= #names do
+    while string.find(names[idx], leading_and) do
+      names[idx] = string.gsub(names[idx], leading_and, '')
+      table.insert(names, idx, '')
+      idx = idx + 1 -- step over the empty name; re-check the stripped piece
+    end
+    idx = idx + 1
+  end
+  return names
+end
+
+-- Pattern fragment for the separator character that is not a tie: an escaped
+-- hyphen ('%-' matches a literal '-').
+local sep_and_not_tie = '%-'
+-- Pattern fragment for all non-whitespace separator characters: the hyphen
+-- plus the tilde, both escaped so the fragment can be spliced into a
+-- '[...]' character class.
+local sep_chars = sep_and_not_tie .. '%~'
+
+local parse_name
+do
+  -- Lua patterns assembled from sep_chars.
+  -- A run of separator characters and/or whitespace.
+  local white_sep         = '[' .. sep_chars .. '%s]+'
+  -- A run of separator characters, whitespace and/or commas.
+  local white_comma_sep   = '[' .. sep_chars .. '%s%,]+'
+  -- A comma, plus any separators/whitespace/commas after it, at the very end
+  -- of the string (captured).
+  local trailing_commas   = '(,[' .. sep_chars .. '%s%,]*)$'
+  -- Exactly one separator character (hyphen or tilde).
+  local sep_char          = '[' .. sep_chars .. ']'
+  -- A run of separators/whitespace at the very start of the string.
+  local leading_white_sep = '^' .. white_sep
+
+  -- <name-parsing utilities>=
+  -- Control sequences that BibTeX treats as "special characters" of known
+  -- case, keyed by command name without the backslash.
+  local upper_specials = { OE = true, AE = true, AA = true, O = true, L = true }
+  local lower_specials = {
+    i = true, j = true, oe = true, ae = true, aa = true,
+    o = true, l = true, ss = true,
+  }
+
+  --- Decide whether a name token belongs to the "von" part, i.e. whether
+  -- its first significant character is lower case.
+  -- The first letter outside braces and the first `{\command` group are
+  -- located; whichever comes first decides. For a command, the tables above
+  -- settle the case; for any other command the first letter found anywhere
+  -- in the token is used.
+  -- @param s  a single name token
+  -- @return true when the token counts as a von token, false otherwise
+  function isVon(s)
+    local lower  = find_outside_braces(s, '%l') -- first nonbrace lowercase
+    local letter = find_outside_braces(s, '%a') -- first nonbrace letter
+    local bs, _, command = find_outside_braces(s, '%{%\\(%a+)') -- \xxx
+    if lower and lower <= letter and lower <= (bs or lower) then
+      return true
+    elseif letter and letter <= (bs or letter) then
+      return false
+    elseif bs then
+      if upper_specials[command] then
+        return false
+      elseif lower_specials[command] then
+        return true
+      else
+        -- Unknown command: fall back to the case of the first letter found
+        -- anywhere in the token, braces included.
+        lower  = string.find(s, '%l')
+        letter = string.find(s, '%a')
+        return lower ~= nil and lower <= letter
+      end
+    else
+      return false
+    end
+  end
+
+  --- Report a malformed name without aborting the parse.
+  -- Uses a global `biberrorf(fmt, ...)` when the host defines one; otherwise
+  -- the formatted message is written to stderr.
+  local function name_error(fmt, ...)
+    local handler = rawget(_G, 'biberrorf')
+    if handler then
+      handler(fmt, ...)
+    else
+      io.stderr:write(string.format(fmt, ...), '\n')
+    end
+  end
+
+  --- Count the text characters of str for the tie heuristic.
+  -- Braces themselves are not counted, and a balanced top-level group that
+  -- starts with `{\` counts as a single character.
+  local function count_text_chars(str)
+    local count, depth, i = 0, 0, 1
+    local len = string.len(str)
+    while i <= len do
+      local c = string.sub(str, i, i)
+      if c == '{' then
+        local _, close = string.find(str, '^%b{}', i)
+        if depth == 0 and close and string.sub(str, i + 1, i + 1) == '\\' then
+          count = count + 1
+          i = close -- skip the whole special-character group
+        else
+          depth = depth + 1
+        end
+      elseif c == '}' then
+        if depth > 0 then depth = depth - 1 end
+      else
+        count = count + 1
+      end
+      i = i + 1
+    end
+    return count
+  end
+
+  --- Abbreviate a token to its first letter, or to its first `{\...}`
+  -- group when that group comes before any letter.
+  -- @return the abbreviation, or '' when the token has neither
+  local function abbrev(token)
+    local first_alpha, _, alpha = string.find(token, '(%a)')
+    local first_brace           = string.find(token, '%{%\\')
+    if first_alpha and first_alpha <= (first_brace or first_alpha) then
+      return alpha
+    elseif first_brace then
+      local i, _, special = string.find(token, '(%b{})', first_brace)
+      if i then
+        return special
+      else -- unbalanced braces
+        return string.sub(token, first_brace)
+      end
+    else
+      return ''
+    end
+  end
+
+  --- Parse one personal name into its BibTeX parts.
+  -- A name has up to four parts: the most general form is either
+  -- ``First von Last Junior'' or ``von Last, Junior, First'', but various
+  -- vons and Juniors can be omitted.
+  -- @param s            the name string
+  -- @param inter_token  optional string placed between the tokens of a part;
+  --                     when absent the original separators are kept, with
+  --                     ties inserted by position and token length
+  -- @return table with the full parts in `ff`, `vv`, `ll`, `jj` and the
+  --         abbreviated parts in `f`, `v`, `l`, `j`; empty parts are absent
+  function parse_name(s, inter_token)
+    if string.find(s, trailing_commas) then
+      name_error("Name '%s' has one or more commas at the end", s)
+    end
+    s = string.gsub(s, trailing_commas, '')
+    s = string.gsub(s, leading_white_sep, '')
+    local tokens   = split(s, white_comma_sep, find_outside_braces)
+    local trailers = splitters(s, white_comma_sep, find_outside_braces)
+
+    -- The string separating tokens is reduced to a single ``separator
+    -- character.'' A comma always trumps other separator characters.
+    -- Otherwise we take the first character, be it a separator or a space.
+    -- commas maps each comma to the index of the token that follows it.
+    local commas = { }
+    for i = 1, #trailers do
+      if string.find(trailers[i], ',', 1, true) then
+        trailers[i] = ','
+        commas[#commas + 1] = i + 1
+      else
+        trailers[i] = string.sub(trailers[i], 1, 1)
+      end
+    end
+
+    local name = { }
+    -- These variables mark token subsequences; if start == lim the
+    -- sequence is empty.
+    local first_start, first_lim, last_lim, von_start, von_lim, jr_lim
+    local n = #tokens
+
+    -- The von name, if any, goes from the first von token to the last von
+    -- token, except the last name is entitled to at least one token. So we
+    -- start just before the last token and wind down until we find a von
+    -- token or hit the von start (in which case there is no von name).
+    local function divide_von_from_last()
+      von_lim = last_lim - 1
+      while von_lim > von_start and not isVon(tokens[von_lim-1]) do
+        von_lim = von_lim - 1
+      end
+    end
+
+    local commacount = #commas
+    if commacount == 0 then -- first von last jr
+      von_start, first_start, last_lim, jr_lim = 1, 1, n+1, n+1
+      local got_von = false
+      while von_start < last_lim-1 do
+        if isVon(tokens[von_start]) then
+          divide_von_from_last()
+          got_von = true
+          break
+        else
+          von_start = von_start + 1
+        end
+      end
+      if not got_von then -- there is no von name
+        -- tokens joined to the last token by a hyphen belong to the last name
+        while von_start > 1
+              and string.find(trailers[von_start - 1], sep_and_not_tie) do
+          von_start = von_start - 1
+        end
+        von_lim = von_start
+      end
+      first_lim = von_start
+    else
+      if commacount > 2 then
+        -- Report, then carry on with the two-comma layout: everything after
+        -- the second comma is treated as the first name.
+        name_error("Too many commas in name '%s'", s)
+      end
+      -- one comma:  von last, first
+      -- two commas: von last, jr, first
+      local jr_end = commas[2] or commas[1]
+      von_start, last_lim, jr_lim, first_start, first_lim =
+        1, commas[1], jr_end, jr_end, n+1
+      divide_von_from_last()
+    end
+
+    -- Set the long and short form of one part together.
+    local function set_name(start, lim, long, short)
+      if start >= lim then return end
+      -- string concatenation is quadratic, but names are short
+      local full  = tokens[start]
+      local brief = abbrev(tokens[start])
+      for i = start + 1, lim - 1 do
+        if inter_token then
+          full  = full  .. inter_token .. tokens[i]
+          brief = brief .. inter_token .. abbrev(tokens[i])
+        else
+          local sep, ssep = trailers[i-1], trailers[i-1]
+          -- An explicit hyphen or tilde is kept. Otherwise a tie is the
+          -- default between the last two tokens of the part, and between
+          -- the first two tokens if the first token is short enough;
+          -- elsewhere a space is the default.
+          if not string.find(sep, sep_char) then
+            if i == lim-1 then
+              sep, ssep = '~', '~'
+            elseif i == start + 1 then
+              sep  = count_text_chars(brief) < 3 and '~' or ' '
+              ssep = count_text_chars(full)  < 3 and '~' or ' '
+            else
+              sep, ssep = ' ', ' '
+            end
+          end
+          full  = full  ..        ssep .. tokens[i]
+          brief = brief .. '.' .. sep  .. abbrev(tokens[i])
+        end
+      end
+      name[long]  = full
+      name[short] = brief
+    end
+
+    set_name(first_start, first_lim, 'ff', 'f')
+    set_name(von_start,   von_lim,   'vv', 'v')
+    set_name(von_lim,     last_lim,  'll', 'l')
+    set_name(last_lim,    jr_lim,    'jj', 'j')
+    return name
+  end
+end
+
+--- Citation and reference formatting driven by style tables.
+-- A style entry is a function that receives an environment table (see
+-- buildEnv) and returns a list of strings and/or functions of the item;
+-- _process resolves the functions and concatenates the result.
+Bibliography = { -- This is big enough to have its own global var
+  CitationStyles = {
+    AuthorYear = function(_ENV)
+      return andSurnames(3), " ", year, optional(", ", cite.page)
+    end
+  },
+
+  --- Format an in-text citation.
+  -- @param cite   citation table; `cite.key` selects the entry in bib
+  -- @param bib    table of entries indexed by key
+  -- @param style  style table
+  -- @return the formatted string, or Errors.UNKNOWN_REFERENCE
+  produceCitation = function (cite, bib, style)
+    local item = bib[cite.key]
+    if not item then
+      return Bibliography.Errors.UNKNOWN_REFERENCE
+    end
+    local t = Bibliography.buildEnv(cite, item.attributes, style)
+    -- NOTE(review): this reads style.CitationStyle (singular) while the
+    -- table above is named CitationStyles; confirm what style objects carry.
+    return Bibliography._process(item.attributes, {style.CitationStyle(t)})
+  end,
+
+  --- Format a full bibliography entry using the style function named after
+  -- the entry type (first letter upper-cased; item.type is updated in place).
+  -- @return the formatted string, Errors.UNKNOWN_REFERENCE, or
+  --         Errors.UNKNOWN_TYPE when the style has no such function
+  produceReference = function (cite, bib, style)
+    local item = bib[cite.key]
+    if not item then
+      return Bibliography.Errors.UNKNOWN_REFERENCE
+    end
+    item.type = (item.type:gsub("^%l", string.upper))
+    if not style[item.type] then
+      return Bibliography.Errors.UNKNOWN_TYPE
+    end
+
+    local t = Bibliography.buildEnv(cite, item.attributes, style)
+    return Bibliography._process(item.attributes, {style[item.type](t)})
+  end,
+
+  --- Build the environment handed to a style function: a clone of _ENV,
+  -- plus `cite`, `item`, every item attribute under its lower-cased key,
+  -- merged with the style table.
+  buildEnv = function (cite,item, style)
+    local t = std.table.clone(_ENV)
+    t.cite = cite
+    t.item = item
+    for k,v in pairs(item) do t[k:lower()] = v end
+    return std.table.merge(t, style)
+  end,
+
+  --- Resolve and concatenate a list of fragments.
+  -- Function fragments are called with item; a nil result counts as "".
+  -- The list t itself is left untouched so that it can be reused.
+  -- @param item    item attributes passed to function fragments
+  -- @param t       list of strings and/or functions
+  -- @param dStart  optional opening delimiter
+  -- @param dEnd    optional closing delimiter
+  -- @return the concatenation, wrapped in the delimiters when any is given
+  --         and the content is non-empty; "" when delimited content is empty
+  _process = function (item, t, dStart, dEnd)
+    local parts = {}
+    for i = 1,#t do
+      local v = t[i]
+      if type(v) == "function" then v = v(item) end
+      if v == nil then v = "" end
+      parts[i] = v
+    end
+    local res = table.concat(parts, "")
+    if dStart or dEnd then
+      if res == "" then return "" end
+      return (dStart or "") .. res .. (dEnd or "")
+    end
+    return res
+  end,
+
+  Errors = {
+    UNKNOWN_REFERENCE = 1,
+    UNKNOWN_TYPE = 2,
+  },
+
+  Style = std.object {
+    --- All authors joined with " and ". A single author is given by
+    -- surname only; otherwise each is "Surname, F." (initials are omitted
+    -- when the name has no first part). A missing Author field yields "".
+    andAuthors = function(item)
+      local authors = namesplit(item.Author or "")
+      if #authors == 1 then
+        return parse_name(authors[1]).ll or ""
+      else
+        for i = 1,#authors do
+          local author = parse_name(authors[i])
+          local surname = author.ll or ""
+          if author.f and author.f ~= "" then
+            authors[i] = surname .. ", " .. author.f .. "."
+          else
+            authors[i] = surname
+          end
+        end
+        return table.concat(authors, " and ")
+      end
+    end,
+
+    --- Returns a fragment producing the authors' surnames, collapsed to
+    -- "First et al." when there are more than max authors.
+    andSurnames = function (max)
+      return function(item)
+        local authors = namesplit(item.Author or "")
+        if #authors > max then
+          return (parse_name(authors[1]).ll or "") .. " et al."
+        else
+          for i = 1,#authors do
+            authors[i] = parse_name(authors[i]).ll or ""
+          end
+          return Bibliography.Style.commafy(authors)
+        end
+      end
+    end,
+
+    --- "Edited by ..." and/or "Translated by ...", comma separated;
+    -- "" when the item has neither field.
+    transEditor = function(item)
+      local r = {}
+      if item.Editor then r[#r+1] = "Edited by "..item.Editor end
+      if item.Translator then r[#r+1] = "Translated by "..item.Translator end
+      return table.concat(r, ", ")
+    end,
+    --- Fragment wrapping its arguments in curly double quotes.
+    quotes = function (...)
+      local t = {...}
+      return function(item)
+        return Bibliography._process(item, t, "“", "”")
+      end
+    end,
+    --- Fragment wrapping its arguments in \em{...}.
+    italic = function (...)
+      local t = {...}
+      return function(item)
+        return Bibliography._process(item, t, "\\em{", "}")
+      end
+    end,
+    --- Fragment wrapping its arguments in parentheses.
+    parens = function (...)
+      local t = {...}
+      return function(item)
+        return Bibliography._process(item, t, "(", ")")
+      end
+    end,
+    --- Fragment that yields the concatenation of its arguments, or "" as
+    -- soon as any of them is nil, false or "". The captured arguments are
+    -- not modified, so the fragment can be evaluated for several items.
+    optional = function(...)
+      local t = {n=select('#', ...), ...}
+      return function(item)
+        local parts = {}
+        for i = 1,t.n do
+          local v = t[i]
+          if type(v) == "function" then v = v(item) end
+          if not v or v == "" then return "" end
+          parts[i] = v
+        end
+        return table.concat(parts, "")
+      end
+    end,
+
+    -- The following functions borrowed from Norman Ramsey's nbibtex,
+    -- with permission.
+    --- Join a list as "a", "a and b" or "a, b, and c".
+    -- @param t        list of strings (not modified)
+    -- @param andword  conjunction, defaults to 'and'
+    -- @return the joined string; "" for an empty list
+    commafy = function (t, andword)
+      andword = andword or 'and'
+      local count = #t
+      if count == 0 then
+        return ''
+      elseif count == 1 then
+        return t[1]
+      elseif count == 2 then
+        return t[1] .. ' ' .. andword .. ' ' .. t[2]
+      else
+        local head = table.concat(t, ', ', 1, count - 1)
+        return head .. ', ' .. andword .. ' ' .. t[count]
+      end
+    end
+  }
+}