Module:Headword: Difference between revisions

From The Languages of David J. Peterson
Jump to navigation Jump to search
mNo edit summary
m (Protected "Module:Headword": High traffic page ([Edit=Allow only administrators] (indefinite) [Move=Allow only administrators] (indefinite)))
 
(45 intermediate revisions by 3 users not shown)
Line 1: Line 1:
local export = {}
local export = {}
local m_string_utils = require("Module:string utilities")
local rfind = m_string_utils.find
local rgmatch = m_string_utils.gmatch
local rsubn = m_string_utils.gsub
local ulen = mw.ustring.len
local unfc = mw.ustring.toNFC


local m_data = mw.loadData("Module:headword/data")
local m_data = mw.loadData("Module:headword/data")
local title = mw.title.getCurrentTitle()


local isLemma = m_data.lemmas
local isLemma = m_data.lemmas
Line 10: Line 15:
local toBeTagged = m_data.toBeTagged
local toBeTagged = m_data.toBeTagged


local parameters = {
-- If set to true, categories always appear, even in non-mainspace pages
lang = { type = "object" },
local test_force_categories = false
script = { type = "object" },
heads = { type = "table" },
translits = { type = "table" },
transcriptions = { type = "table" },
inflections = { type = "table" },
genders = { type = "table" },
categories = { type = "table" },
pos_category = { type = "string" },
sort_key = { type = "string" },
id = { type = "string" },
}


local function test_script(text, script_code)
-- Version of rsubn() that discards all but the first return value.
if type(text) == "string" and type(script_code) == "string" then
local function rsub(term, foo, bar)
local sc = require("Module:scripts").getByCode(script_code)
return (rsubn(term, foo, bar))
local characters
end
if sc then
 
characters = sc:getCharacters()
-- Add a tracking category to track entries with certain (usually undesirable) properties. `track_id` is an identifier
end
-- for the particular property being tracked and goes into the tracking page. Specifically, this adds a link in the
-- page text to [[Template:tracking/headword/TRACK_ID]], meaning you can find all entries with the `track_id` property
local out
-- by visiting [[Special:WhatLinksHere/Template:tracking/headword/TRACK_ID]].
if characters then
--
text = mw.ustring.gsub(text, "%W", "")
-- If `code` (a language or script code) is given, an additional tracking page
out = mw.ustring.find(text, "[" .. characters .. "]")
-- [[Template:tracking/headword/TRACK_ID/CODE]] is linked to, and you can find all entries in the combination of
end
-- `track_id` and `code` by visiting [[Special:WhatLinksHere/Template:tracking/headword/TRACK_ID/CODE]]. This makes it
-- possible to isolate only the entries with a specific tracking property that are in a given language or script.
if out then
local function track(track_id, code)
return true
local tracking_page = "headword/" .. track_id
else
local m_debug_track = require("Module:debug/track")
return false
if code then
end
m_debug_track{tracking_page, tracking_page .. "/" .. code}
else
else
mw.log("Parameters to test_script were incorrect.")
m_debug_track(tracking_page)
return nil
end
end
return true
end
end




local function preprocess(data)
local function text_in_script(text, script_code)
--[=[
local sc = require("Module:scripts").getByCode(script_code)
[[Special:WhatLinksHere/Template:tracking/headword/heads-not-table]]
if not sc then
[[Special:WhatLinksHere/Template:tracking/headword/translits-not-table]]
error("Internal error: Bad script code " .. script_code)
]=]
if type(data.heads) ~= "table" then
if data.heads then
require("Module:debug").track("headword/heads-not-table")
end
data.heads = { data.heads }
end
end
local characters = sc:getCharacters()
if type(data.translits) ~= "table" then
 
if data.translits then
local out
require("Module:debug").track("headword/translits-not-table")
if characters then
end
text = rsub(text, "%W", "")
out = rfind(text, "[" .. characters .. "]")
data.translits = { data.translits }
end
end
 
if type(data.transcriptions) ~= "table" then
if out then
if data.transcriptions then
return true
require("Module:debug").track("headword/transcriptions-not-table")
else
end
return false
data.transcriptions = { data.transcriptions }
end
end
end
if not data.heads or #data.heads == 0 then
 
data.heads = {""}
 
local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
Used to exclude characters from the regex above. ]]
local wordPunc = "-־׳״'.·*’་•:"
local notWordPunc = "[^" .. wordPunc .. "]+"
 
 
-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or
-- customized separator: `part` is the object specifying the term, which should optionally contain:
-- * left qualifiers in `q`, an array of strings (or `qualifiers` for compatibility purposes);
-- * right qualifiers in `qq`, an array of strings;
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
--  (formatted reference text) and optionally `name` and/or `group`;
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
local function format_term_with_qualifiers_and_refs(part, formatted, j)
local left_qualifiers, right_qualifiers
local reftext
 
left_qualifiers = part.q and #part.q > 0 and part.q
if left_qualifiers then
left_qualifiers = require("Module:qualifier").format_qualifier(left_qualifiers) .. " "
end
end
 
-- Determine if term is reconstructed
right_qualifiers = part.qq and #part.qq > 0 and part.qq
local is_reconstructed = data.lang:getType() == "reconstructed"
if right_qualifiers then
or title.nsText == "Reconstruction"
right_qualifiers = " " .. require("Module:qualifier").format_qualifier(right_qualifiers)
-- Create a default headword.
local subpagename = title.subpageText
local pagename = title.text
local default_head
if is_reconstructed then
default_head = require("Module:utilities").plain_gsub(pagename, data.lang:getCanonicalName() .. "/", "")
else
default_head = subpagename
end
end
if part.refs and #part.refs > 0 then
-- Add links to multi-word page names when appropriate
local refs = {}
if data.lang:getCode() ~= "zh" then
for _, ref in ipairs(part.refs) do
local spacingPunctuation = "[%s%p]+"
if type(ref) ~= "table" then
--[[ List of punctuation or spacing characters that are found inside of words.
ref = {text = ref}
Used to exclude characters from the regex above. ]]
local wordPunc = "-־׳״'.·*’་"
local notWordPunc = "[^" .. wordPunc .. "]+"
local contains_words = false
for possibleWordBreak in mw.ustring.gmatch(default_head, spacingPunctuation) do
if mw.ustring.find(possibleWordBreak, notWordPunc) then
contains_words = true
break
end
end
end
local refargs
if ref.name or ref.group then
if (not is_reconstructed) and contains_words then
refargs = {name = ref.name, group = ref.group}
local function workaround_to_exclude_chars(s)
return mw.ustring.gsub(s, notWordPunc, "]]%1[[")
end
end
table.insert(refs, mw.getCurrentFrame():extensionTag("ref", ref.text, refargs))
default_head = "[["
.. mw.ustring.gsub(
default_head,
spacingPunctuation,
workaround_to_exclude_chars
)
.. "]]"
--
--[=[use this when workaround is no longer needed:
default_head = "[["
.. mw.ustring.gsub(default_head, WORDBREAKCHARS, "]]%1[[")
.. "]]"
--[=[
Remove any empty links, which could have been created above
at the beginning or end of the string.
]=]
default_head = mw.ustring.gsub(default_head, "%[%[%]%]", "")
end
end
reftext = table.concat(refs)
end
local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator
if left_qualifiers then
formatted = left_qualifiers .. formatted
end
end
if reftext then
if is_reconstructed then
formatted = formatted .. reftext
default_head = "*" .. default_head
end
if right_qualifiers then
formatted = formatted .. right_qualifiers
end
if separator then
formatted = separator .. formatted
end
end
 
-- If a head is the empty string "", then replace it with the default
return formatted
for i, head in ipairs(data.heads) do
end
if head == "" then
 
head = default_head
 
else
--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
if head == default_head and data.lang:getCanonicalName() == "English" then
function export.head_is_multiword(head)
table.insert(data.categories, data.lang:getCanonicalName() .. " terms with redundant head parameter")
for possibleWordBreak in rgmatch(head, spacingPunctuation) do
end
if rfind(possibleWordBreak, notWordPunc) then
return true
end
end
data.heads[i] = head
end
end
 
--[[ Try to detect the script if it was not provided
return false
We use the first headword for this, and assume
end
that all of them have the same script
 
This *should* always be true, right? ]]
 
if not data.sc then
--[==[Add links to a multiword head.]==]
data.sc = require("Module:scripts").findBestScript(data.heads[1], data.lang)
function export.add_multiword_links(head, default)
local function workaround_to_exclude_chars(s)
return rsub(s, notWordPunc, "\2%1\1")
end
end
 
for i, val in pairs(data.translits) do
head = "\1" .. rsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
data.translits[i] = {display = val, is_manual = true}
if default then
head = head
:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
end
end
-- Make transliterations
--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
for i, head in ipairs(data.heads) do
head = require("Module:utilities").make_entities(head, "%[%]")
local translit = data.translits[i]
-- Try to generate a transliteration if necessary
-- Generate it if the script is not Latn or similar, and if no transliteration was provided
if translit and translit.display == "-" then
translit = nil
elseif not translit and not (data.sc:getCode():find("Latn", nil, true) or data.sc:getCode() == "Latinx" or data.sc:getCode() == "None") and (not data.sc or data.sc:getCode() ~= "Imag") then
translit = data.lang:transliterate(require("Module:links").remove_links(head), data.sc)
-- There is still no transliteration?
-- Add the entry to a cleanup category.
if not translit and not notranslit[data.lang:getCode()] then
translit = "<small>transliteration needed</small>"
table.insert(data.categories, "Requests for transliteration of " .. data.lang:getCanonicalName() .. " terms")
end
if translit then
translit = {display = translit, is_manual = false}
end
end
-- Link to the transliteration entry for languages that require this
if translit and data.lang:link_tr() then
translit.display = require("Module:links").full_link{
term = translit.display,
lang = data.lang,
sc = require("Module:scripts").getByCode("Latn"),
tr = "-"
}
end
data.translits[i] = translit
end
if data.id and type(data.id) ~= "string" then
--[=[
error("The id in the data table should be a string.")
use this when workaround is no longer needed:
end
 
head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
 
Remove any empty links, which could have been created above
at the beginning or end of the string.
]=]
return (head
:gsub("\1\2", "")
:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
end
 
 
-- Report whether the page in `data.title` should be left out of categorization: any page whose title text
-- contains "Main Page", or a page in the Appendix namespace whose title text starts with "Gestures/".
-- The result is the truthy match position returned by string.find, or a falsy value (nil/false) otherwise.
local function non_categorizable(data)
	local page = data.title
	local main_page_pos = page.text:find("Main Page")
	if main_page_pos then
		return main_page_pos
	end
	local in_appendix = page:inNamespace("Appendix")
	if not in_appendix then
		-- Hand the falsy namespace-check result back unchanged.
		return in_appendix
	end
	-- The parentheses keep only the first result of find() (the start position).
	return (page.text:find("^Gestures/"))
end
end




-- Format a headword with transliterations
-- Format a headword with transliterations.
local function format_headword(data)
local function format_headword(data)
local m_links = require("Module:links")
local m_scriptutils = require("Module:script utilities")
local m_scriptutils = require("Module:script utilities")
 
-- Are there non-empty transliterations?
-- Are there non-empty transliterations?
-- Need to do it this way because translit[1] might be nil while translit[2] is not
local has_translits = false
local has_translits = false
local has_manual_translits = false
-- Format the headwords
 
for i, head in ipairs(data.heads) do
------ Format the headwords. ------
if data.translits[i] or data.transcriptions[i] then
 
local head_parts = {}
local unique_head_parts = {}
 
local has_multiple_heads = #data.heads > 1
 
for j, head in ipairs(data.heads) do
if head.tr or head.ts then
has_translits = true
has_translits = true
end
end
if head.tr and head.tr_manual or head.ts then
-- Apply processing to the headword, for formatting links and such
has_manual_translits = true
if head:find("[[", nil, true) and (not data.sc or data.sc:getCode() ~= "Imag") then
head = m_links.language_link({term = head, lang = data.lang}, false)
end
end
 
-- Add language and script wrapper
local formatted
if i == 1 then
 
head = m_scriptutils.tag_text(head, data.lang, data.sc, "head", nil, data.id)
-- Apply processing to the headword, for formatting links and such.
if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Imag" then
formatted = require("Module:links").language_link({term = head.term, lang = data.lang}, false)
else
else
head = m_scriptutils.tag_text(head, data.lang, data.sc, "head", nil)
formatted = data.lang:makeDisplayText(head.term, head.sc, true)
end
end
 
data.heads[i] = head
local function tag_text_and_add_quals_and_refs(head, formatted, j)
end
-- Add language and script wrapper.
formatted = m_scriptutils.tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)
local translits_formatted = ""
-- Add qualifiers, references and separator.
return format_term_with_qualifiers_and_refs(head, formatted, j)
if has_translits then
end
-- Format the transliterations
 
local translits = data.translits
local head_part = tag_text_and_add_quals_and_refs(head, formatted, j)
local transcriptions = data.transcriptions
table.insert(head_parts, head_part)
 
if translits then
-- If multiple heads, try to determine whether all heads display the same. To do this we need to effectively
-- using pairs() instead of ipairs() in case there is a gap
-- rerun the text tagging and addition of qualifiers and references, using 1 for all indices.
for i, _ in pairs(translits) do
if has_multiple_heads then
if type(i) == "number" then
local unique_head_part
translits[i] = m_scriptutils.tag_translit(translits[i].display, data.lang:getCode(), "head", nil, translits[i].is_manual)
if j == 1 then
end
unique_head_part = head_part
else
unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)
end
end
unique_head_parts[unique_head_part] = true
end
end
end
local set_size = 0
if has_multiple_heads then
for k, _ in pairs(unique_head_parts) do
set_size = set_size + 1
end
end
if set_size == 1 then
head_parts = head_parts[1]
else
head_parts = table.concat(head_parts)
end
if has_manual_translits then
-- [[Special:WhatLinksHere/Template:tracking/headword/has-manual-translit]]
-- [[Special:WhatLinksHere/Template:tracking/headword/has-manual-translit/LANGCODE]]
track("has-manual-translit", data.lang:getCode())
end
------ Format the transliterations and transcriptions. ------
local translits_formatted


if transcriptions then
if has_translits then
for i, _ in pairs(transcriptions) do
local translit_parts = {}
if type(i) == "number" then
for i, head in ipairs(data.heads) do
transcriptions[i] = m_scriptutils.tag_transcription(transcriptions[i], data.lang:getCode(), "head")
if head.tr or head.ts then
local this_parts = {}
if head.tr then
table.insert(this_parts, m_scriptutils.tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
if head.ts then
table.insert(this_parts, " ")
end
end
end
if head.ts then
table.insert(this_parts, "/" .. m_scriptutils.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
end
table.insert(translit_parts, table.concat(this_parts))
end
end
end
end


for i = 1, math.max(#translits, #transcriptions) do
translits_formatted = " (" .. table.concat(translit_parts, " <i>or</i> ") .. ")"
local translits_formatted = {}
 
table.insert(translits_formatted, translits[i] and translits[i] or "")
local transliteration_page = mw.title.new(data.lang:getCanonicalName() .. " transliteration", "Project")
table.insert(translits_formatted, (translits[i] and transcriptions[i]) and " " or "")
 
table.insert(translits_formatted, transcriptions[i] and "/" .. transcriptions[i] .. "/" or "")
data.translits[i] = table.concat(translits_formatted)
end
translits_formatted = " (" .. table.concat(data.translits, " <i>or</i> ") .. ")"
local transliteration_page = mw.title.new(data.lang:getCanonicalName() .. " transliteration", "Wiktionary")
if transliteration_page then
if transliteration_page then
local success, exists = pcall(function () return transliteration_page.exists end)
local success, exists = pcall(function () return transliteration_page.exists end)
if success and exists then
if success and exists then
translits_formatted = " [[Wiktionary:" .. data.lang:getCanonicalName() .. " transliteration|•]]" .. translits_formatted
translits_formatted = " [[Project:" .. data.lang:getCanonicalName() .. " transliteration|•]]" .. translits_formatted
end
end
end
end
else
translits_formatted = ""
end
end
 
return table.concat(data.heads, " <i>or</i> ") .. translits_formatted
------ Paste heads and transliterations/transcriptions. ------
 
return head_parts .. translits_formatted
end
end


Line 288: Line 292:
local function format_genders(data)
local function format_genders(data)
if data.genders and #data.genders > 0 then
if data.genders and #data.genders > 0 then
local gen = require("Module:gender and number")
local pos_for_cat
return "&nbsp;" .. gen.format_list(data.genders, data.lang)
if not data.nogendercat and not m_data.no_gender_cat[data.lang:getCode()] then
local pos_category = data.pos_category:gsub("^reconstructed ", "")
pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
end
local text, cats = require("Module:gender and number").format_genders(data.genders, data.lang, pos_for_cat)
for _, cat in ipairs(cats) do
table.insert(data.categories, cat)
end
return "&nbsp;" .. text
else
else
return ""
return ""
Line 297: Line 309:


local function format_inflection_parts(data, parts)
local function format_inflection_parts(data, parts)
local m_links = require("Module:links")
local any_part_translit = false
 
for key, part in ipairs(parts) do
for j, part in ipairs(parts) do
if type(part) ~= "table" then
if type(part) ~= "table" then
part = {term = part}
part = {term = part}
end
end
 
local qualifiers = ""
if part.qualifiers and #part.qualifiers > 0 then
qualifiers = require("Module:qualifier").format_qualifier(part.qualifiers) .. " "
-- [[Special:WhatLinksHere/Template:tracking/headword/qualifier]]
require("Module:debug").track("headword/qualifier")
end
local partaccel = part.accel
local partaccel = part.accel
local face = part.hypothetical and "hypothetical" or "bold"
local face = part.hypothetical and "hypothetical" or "bold"
local nolink = part.hypothetical or part.nolink
 
-- Here the final part 'or data.nolink' allows to have 'nolink=true'
-- Convert the term into a full link
-- right into the 'data' table to disable links of the entire headword
-- Don't show a transliteration here, the consensus seems to be not to
-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
-- show them in headword lines to avoid clutter.
local nolink = part.hypothetical or part.nolink or data.nolink
part = m_links.full_link(
 
{
local formatted
term = not nolink and part.term or nil,
if part.label then
alt = part.alt or (nolink and part.term or nil),
-- FIXME: There should be a better way of italicizing a label. As is, this isn't customizable.
lang = part.lang or data.lang,
formatted = "<i>" .. part.label .. "</i>"
sc = part.sc or parts.sc or (not part.lang and data.sc),
else
id = part.id,
-- Convert the term into a full link. Don't show a transliteration here unless enable_auto_translit is
genders = part.genders,
-- requested, either at the `parts` level (i.e. per inflection) or at the `data.inflections` level (i.e.
tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil),
-- specified for all inflections). This is controllable in {{head}} using autotrinfl=1 for all inflections,
ts = part.transcription,
-- or fNautotr=1 for an individual inflection (remember that a single inflection may be associated with
accel = parts.accel or partaccel,
-- multiple terms). The reason for doing this is to avoid clutter in headword lines by default in languages
},
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
face,
-- to be enabled in languages with more complex scripts (e.g. Arabic).
false
local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
)
if tr ~= "-" then
any_part_translit = true
part = qualifiers .. part
end
formatted = require("Module:links").full_link(
parts[key] = part
{
term = not nolink and part.term or nil,
alt = part.alt or (nolink and part.term or nil),
lang = part.lang or data.lang,
-- FIXME, code smell in always using the first script.
sc = part.sc or parts.sc or (not part.lang and data.heads[1].sc),
id = part.id,
genders = part.genders,
tr = tr,
ts = part.transcription,
accel = partaccel or parts.accel,
},
face,
false
)
end
 
parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)
end
end
 
local parts_output = ""
local parts_output
 
if #parts > 0 then
if #parts > 0 then
parts_output = " " .. table.concat(parts, " <i>or</i> ")
parts_output = (parts.label and " " or "") .. table.concat(parts)
elseif parts.request then
elseif parts.request then
parts_output = " <small>[please provide]</small>"
parts_output = " <small>[please provide]</small>"
.. require("Module:utilities").format_categories(
table.insert(data.categories, "Requests for inflections in " .. data.lang:getCanonicalName() .. " entries")
{"Requests for inflections in " .. data.lang:getCanonicalName() .. " entries"},
else
lang,
parts_output = ""
nil,
nil,
data.force_cat_output,
data.sc
)
end
end
return "<i>" .. parts.label .. "</i>" .. parts_output
local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
return parts_label .. parts_output, any_part_translit
end
end


-- Format the inflections following the headword
 
-- Format the inflections following the headword.
local function format_inflections(data)
local function format_inflections(data)
local any_part_translit = false
if data.inflections and #data.inflections > 0 then
if data.inflections and #data.inflections > 0 then
-- Format each inflection individually
-- Format each inflection individually.
for key, infl in ipairs(data.inflections) do
for key, infl in ipairs(data.inflections) do
data.inflections[key] = format_inflection_parts(data, infl)
local this_any_part_translit
data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
if this_any_part_translit then
any_part_translit = true
end
end
end
 
return " (" .. table.concat(data.inflections, ", ") .. ")"
local concat_result = table.concat(data.inflections, ", ")
return " (" .. concat_result .. ")"
else
else
return ""
return ""
Line 374: Line 397:
end
end


local function show_headword_line(data)
--[==[
local namespace = title.nsText
-- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
-- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
-- If you have a POS in its singular form, call pluralize() in [[Module:string utilities]] to
-- pluralize it in a smart fashion that knows when to add '-s' and when to add '-es'.
--
-- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
-- based on whether it ends in " forms"; otherwise, return nil.]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
	-- Look the POS up both as given and with a leading "reconstructed " removed. The extra parentheses
	-- discard gsub's second result (the substitution count).
	local without_recon = (plpos:gsub("^reconstructed ", ""))
	if isLemma[plpos] or isLemma[without_recon] then
		return "lemma"
	end
	if isNonLemma[plpos] or isNonLemma[without_recon] then
		return "non-lemma form"
	end

	-- A recognized POS (lemma or non-lemma) carrying a "mutated " prefix is treated as a non-lemma form.
	local without_mutation = (plpos:gsub("^mutated ", ""))
	if isLemma[without_mutation] or isNonLemma[without_mutation] then
		return "non-lemma form"
	end

	-- Unknown POS: only guess when the caller asked for it, based on a trailing " forms".
	if not best_guess then
		return nil
	end
	if plpos:find(" forms$") then
		return "non-lemma form"
	end
	return "lemma"
end


-- Check the namespace against the language type
 
if namespace == "" then
-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
if data.lang:getType() == "reconstructed" then
-- zero-length array if unspecified. Check to make sure all keys are numeric (other than "maxindex", which is set by
error("Entries for this language must be placed in the Reconstruction: namespace.")
-- [[Module:parameters]] for list parameters), all values are strings, and unless `allow_blank_string` is given,
elseif data.lang:getType() == "appendix-constructed" then
-- no blank (zero-length) strings are present.
error("Entries for this language must be placed in the Appendix: namespace.")
-- Validate the array-like field `data[element]` and return its largest numeric index.
--
-- Parameters:
-- * `data`: table holding the field; `data[element]` is replaced by {} when it is nil or false.
-- * `element`: name of the field to check (also used in the error messages).
-- * `allow_blank_string`: if truthy, zero-length string values are accepted.
--
-- Returns the largest numeric key found, or 0 if there is none. The array may have gaps, so it is walked
-- with pairs() rather than ipairs(). The key "maxindex" is skipped. Raises an error if the field is not a
-- table, if any other non-numeric key is present, if a truthy value is not a string, or if a blank string
-- is found while `allow_blank_string` is not set (a value of `false` is accepted and means "default").
local function init_and_find_maximum_index(data, element, allow_blank_string)
	if not data[element] then
		data[element] = {}
	end
	local list = data[element]
	local typ = type(list)
	if typ ~= "table" then
		error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
	end

	local maxind = 0
	for k, v in pairs(list) do
		if k ~= "maxindex" then
			if type(k) ~= "number" then
				-- Report the field via `element` (the original referenced an undefined `name`), and pass the
				-- key through tostring() because %s rejects non-string keys such as booleans on Lua 5.1.
				error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(tostring(k), element))
			end
			if k > maxind then
				maxind = k
			end
			if v then
				if type(v) ~= "string" then
					error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
				end
				if not allow_blank_string and v == "" then
					error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
				end
			end
		end
	end
	return maxind
end
--[==[
-- Append maintenance categories for the language (to `lang_cats`) and for the
-- page as a whole (to `page_cats`), based on flags found in `m_data`. The
-- headword is a somewhat arbitrary home for these, chosen mainly because
-- headword templates are mandatory for entries (so in theory coverage is full).
--
-- This is an external entry point so that modules which transclude information
-- from other entries (such as {{tl|ja-see}}), and are therefore used in place
-- of a conventional headword template, can apply the same categories.]==]
function export.maintenance_cats(m_data, lang, lang_cats, page_cats)
	local has_raw_topic_cat = m_data.wikitext_topic_cat[lang:getCode()]
	local langname = lang:getCanonicalName()

	-- Language-level categories.
	if has_raw_topic_cat then
		table.insert(lang_cats, langname .. " entries with topic categories using raw markup")
	end
	if m_data.wikitext_langname_cat[langname] then
		table.insert(lang_cats, langname .. " entries with language name categories using raw markup")
	end

	-- Page-level categories.
	if m_data.unsupported_title then
		table.insert(page_cats, "Unsupported titles")
	end
	-- For these two fields the stored value itself is the category name to add.
	for _, field in ipairs({"pagename_defaultsort_conflict", "pagename_displaytitle_conflict"}) do
		local conflict_cat = m_data[field]
		if conflict_cat then
			table.insert(page_cats, conflict_cat)
		end
	end
end
--[==[This is the primary external entry point.
{{lua|full_headword(data)}}
This is used by {{temp|head}} and various language-specific headword templates (e.g. {{temp|ru-adj}} for Russian adjectives, {{temp|de-noun}} for German nouns, etc.) to display an entire headword line.
See [[#Further explanations for full_headword()]]
]==]
function export.full_headword(data)
local remove_links = require("Module:links").remove_links
local format_categories = require("Module:utilities").format_categories
-- Prevent data from being destructively modified.
local data = require("Module:table").shallowcopy(data)
local tracking_categories = {}
------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
 
if data.getCanonicalName then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
 
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
 
if data.id and type(data.id) ~= "string" then
error("The id in the data table should be a string.")
end
 
------------ 2. Initialize pagename etc. ------------


if not data.noposcat then
local langcode = data.lang:getCode()
local pos_category = data.lang:getCanonicalName() .. " " .. data.pos_category
local langname = data.lang:getCanonicalName()
if pos_category ~= "Translingual Han characters" then
 
table.insert(data.categories, 1, pos_category)
if data.pagename then -- for testing, doc pages, etc.
data.title = mw.title.new(data.pagename)
if not data.title then
error(("Bad value for `data.pagename`: '%s'"):format(data.pagename))
end
end
else
data.title = mw.title.getCurrentTitle()
end
end
 
-- Is it a lemma category?
local pagename = data.title.text
if isLemma[data.pos_category] or isLemma[data.pos_category:gsub("^reconstructed ", "")] then
local namespace = data.title.nsText
if not data.noposcat then
 
table.insert(data.categories, 1, data.lang:getCanonicalName() .. " lemmas")
-- Check the namespace against the language type.
if namespace == "" then
if data.lang:hasType("reconstructed") then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
end
-- Is it a nonlemma category?
end
elseif isNonLemma[data.pos_category]
 
or isNonLemma[data.pos_category:gsub("^reconstructed ", "")]
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
or isLemma[data.pos_category:gsub("^mutated ", "")]
 
or isNonLemma[data.pos_category:gsub("^mutated ", "")] then
if type(data.heads) == "table" and type(data.heads[1]) == "table" then
-- new-style
if not data.noposcat then
if data.translits or data.transcriptions then
table.insert(data.categories, 1, data.lang:getCanonicalName() .. " non-lemma forms")
error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
end
-- It's neither; we don't know what this category is, so tag it with a tracking category.
else
else
--[=[
-- convert old-style `heads`, `translits` and `transcriptions` to new-style
[[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos]]
local maxind = math.max(
]=]
init_and_find_maximum_index(data, "heads",true),
table.insert(tracking_categories, "head tracking/unrecognized pos")
init_and_find_maximum_index(data, "translits", true),
require("Module:debug").track{
init_and_find_maximum_index(data, "transcriptions", true)
"headword/unrecognized pos",
)
"headword/unrecognized pos/lang/" .. data.lang:getCode(),
for i = 1, maxind do
"headword/unrecognized pos/pos/" .. data.pos_category
data.heads[i] = {
}
term = data.heads[i],
tr = data.translits[i],
ts = data.transcriptions[i],
}
end
end
end
 
-- Preprocess
-- Make sure there's at least one head.
preprocess(data)
if not data.heads[1] then
data.heads[1] = {}
local m_links = require("Module:links")
end
 
if namespace == "" and data.lang:getType() ~= "reconstructed" then
------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------
for _, head in ipairs(data.heads) do
 
if title.prefixedText ~= m_links.getLinkPage(m_links.remove_links(head), data.lang) then
init_and_find_maximum_index(data, "categories")
--[=[
init_and_find_maximum_index(data, "whole_page_categories")
[[Special:WhatLinksHere/Template:tracking/headword/pagename spelling mismatch]]
local pos_category_already_present = false
]=]
if #data.categories > 0 then
require("Module:debug").track{
local escaped_langname = require("Module:pattern utilities").pattern_escape(langname)
"headword/pagename spelling mismatch",
local matches_lang_pattern = "^" .. escaped_langname .. " "
"headword/pagename spelling mismatch/" .. data.lang:getCode()
for _, cat in ipairs(data.categories) do
}
-- Does the category begin with the language name? If not, tag it with a tracking category.
break
if not cat:find(matches_lang_pattern) then
-- [[Special:WhatLinksHere/Template:tracking/headword/no lang category]]
-- [[Special:WhatLinksHere/Template:tracking/headword/no lang category/LANGCODE]]
track("no lang category", langcode)
end
end
end
-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
-- throw an error below.
if not data.pos_category and data.categories[1]:find(matches_lang_pattern) then
data.pos_category = data.categories[1]:gsub(matches_lang_pattern, "")
-- Optimization to avoid inserting category already present.
pos_category_already_present = true
end
end
end
end
 
-- Format and return all the gathered information
if not data.pos_category then
return
error("`data.pos_category` not specified and could not be inferred from the categories given in "
format_headword(data) ..
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
format_genders(data) ..
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
format_inflections(data) ..
.. "language's canonical name plus the plural part of speech (e.g. \"Norwegian Bokmål proper nouns\")."
require("Module:utilities").format_categories(
tracking_categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
)
)
end
end
 
-- Insert a category at the beginning for the part of speech unless it's already present or `data.noposcat` given.
if not pos_category_already_present and not data.noposcat then
local pos_category = langname .. " " .. data.pos_category
-- FIXME: [[User:Theknightwho]] Why is this special case here? Please add an explanatory comment.
if pos_category ~= "Translingual Han characters" then
table.insert(data.categories, 1, pos_category)
end
end
 
-- Try to determine whether the part of speech refers to a lemma or a non-lemma form; if we can figure this out,
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
if not postype then
-- We don't know what this category is, so tag it with a tracking category.
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos]]
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/LANGCODE]]
track("unrecognized pos", langcode)
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/POS]]
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/POS/LANGCODE]]
track("unrecognized pos/pos/" .. data.pos_category, langcode)
elseif not data.noposcat then
table.insert(data.categories, 1, langname .. " " .. postype .. "s")
end
 
------------ 5. Create a default headword, and add links to multiword page names. ------------
 
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")
 
-- Create a default headword based on the pagename, which is determined in
-- advance by the data module so that it only needs to be done once.
local default_head = mw.ustring.lower(m_data.pagename)
local unmodified_default_head = default_head
 
-- Add links to multi-word page names when appropriate
if not m_data.no_multiword_links[langcode] and not is_reconstructed and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
end


function export.full_headword(data)
if is_reconstructed then
local tracking_categories = {}
default_head = "*" .. default_head
-- Script-tags the topmost header.
local pagename = title.text
local fullPagename = title.fullText
local namespace = title.nsText
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("In data, the first argument to full_headword, data.lang should be a language object.")
end
end
 
if not data.sc then
------------ 6. Fill in missing values in `data.heads`. ------------
data.sc = require("Module:scripts").findBestScript(data.heads and data.heads[1] ~= "" and data.heads[1] or pagename, data.lang)
 
else
-- True if any script among the headword scripts has spaces in it.
-- Track uses of sc parameter
local any_script_has_spaces = false
local best = require("Module:scripts").findBestScript(pagename, data.lang)
-- True if any term has a redundant head= param.
require("Module:debug").track("headword/sc")
local has_redundant_head_param = false
 
if data.sc:getCode() == best:getCode() then
for _, head in ipairs(data.heads) do
require("Module:debug").track("headword/sc/redundant")
 
require("Module:debug").track("headword/sc/redundant/" .. data.sc:getCode())
------ 6a. If missing head, replace with default head.
else
if not head.term then
require("Module:debug").track("headword/sc/needed")
head.term = default_head
require("Module:debug").track("headword/sc/needed/" .. data.sc:getCode())
elseif head.term == default_head then
has_redundant_head_param = true
end
 
------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------    otherwise fall back to the overall script if given. If neither given, autodetect the script.
 
if not head.sc then
if data.sc then
-- Overall script given.
head.sc = data.sc
else
-- Autodetect script.
head.sc = data.lang:findBestScript(head.term)
end
end
 
-- If using a discouraged character sequence, add to maintenance category.
if head.sc:hasNormalizationFixes() == true then
local composed_head = unfc(head.term)
if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
table.insert(data.whole_page_categories, "Pages using discouraged character sequences")
end
end
 
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
 
------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
------    (provided automatic translit is available, e.g. not in Persian or Hebrew).
 
-- Make transliterations
head.tr_manual = nil
 
-- Try to generate a transliteration if necessary
if head.tr == "-" then
head.tr = nil
elseif not notranslit[langcode] and head.sc:isTransliterated() then
head.tr_manual = not not head.tr
local text = head.term
if not data.lang:link_tr() then
text = remove_links(text)
end
local automated_tr, tr_categories
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
if automated_tr or head.tr_fail then
local manual_tr = head.tr
 
if manual_tr then
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
table.insert(data.categories, "Terms with redundant transliterations")
table.insert(data.categories, "Terms with redundant transliterations/" .. langcode)
elseif not head.tr_fail then
table.insert(data.categories, "Terms with manual transliterations different from the automated ones")
table.insert(data.categories, "Terms with manual transliterations different from the automated ones/" .. langcode)
end
end
 
if not manual_tr then
head.tr = automated_tr
for _, category in ipairs(tr_categories) do
table.insert(data.categories, category)
end
end
end
 
-- There is still no transliteration?
-- Add the entry to a cleanup category.
if not head.tr then
head.tr = "<small>transliteration needed</small>"
table.insert(data.categories, "Requests for transliteration of " .. langname .. " terms")
else
-- Otherwise, trim it.
head.tr = mw.text.trim(head.tr)
end
end
 
-- Link to the transliteration entry for languages that require this.
if head.tr and data.lang:link_tr() then
head.tr = require("Module:links").full_link {
term = head.tr,
lang = data.lang,
sc = require("Module:scripts").getByCode("Latn"),
tr = "-"
}
end
end
end
end
 
local displayTitle
------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
 
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
if namespace == "" and data.sc and toBeTagged[data.sc:getCode()] or
-- (FIXME: Don't make assumptions like this, and if you need to do so, throw an error if the assumption is violated.)
data.sc:getCode() == "Jpan" and (test_script(pagename, "Hira") or test_script(pagename, "Kana")) then
-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the headword, as in [[check]]. The check for ASCII
displayTitle = '<span class="' .. data.sc:getCode() .. '">' .. pagename .. '</span>'
-- might need to be expanded to a check for any Latin characters and whitespace or punctuation.
local display_title
-- Where there are multiple headwords, use the script for the first. This assumes the first headword is similar to
-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
-- about the best we can do (alternatively we could potentially do script detection on the pagename).
local dt_script = data.heads[1].sc
local dt_script_code = dt_script:getCode()
local page_non_ascii = namespace == "" and not pagename:find("^[%z\1-\127]+$")
local unsupported_pagename, unsupported = pagename:gsub("^Unsupported titles/", "")
if unsupported == 1 and m_data.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. m_data.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(pagename, "Hira") or text_in_script(pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(pagename, "Hang")) then
display_title = '<span class="' .. dt_script_code .. '">' .. pagename .. '</span>'
-- Keep Han entries region-neutral in the display title.
elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
display_title = '<span class="Hani">' .. pagename .. '</span>'
elseif namespace == "Reconstruction" then
elseif namespace == "Reconstruction" then
displayTitle, matched = mw.ustring.gsub(
display_title, matched = rsubn(
fullPagename,
data.title.fullText,
"^(Reconstruction:[^/]+/)(.+)$",
"^(Reconstruction:[^/]+/)(.+)$",
function(before, term)
function(before, term)
Line 494: Line 783:
term,
term,
data.lang,
data.lang,
data.sc
dt_script
)
)
end
end
)
)
if matched == 0 then
if matched == 0 then
displayTitle = nil
display_title = nil
end
end
end
end
if displayTitle then
if display_title then
local frame = mw.getCurrentFrame()
mw.getCurrentFrame():callParserFunction(
frame:callParserFunction(
"DISPLAYTITLE",
"DISPLAYTITLE",
displayTitle
display_title
)
)
end
end
------------ 8. Insert additional categories. ------------
if data.force_cat_output then
if data.force_cat_output then
--[=[
-- [[Special:WhatLinksHere/Template:tracking/headword/force cat output]]
[[Special:WhatLinksHere/Template:tracking/headword/force cat output]]
track("force cat output")
]=]
require("Module:debug").track("headword/force cat output")
end
end
 
if data.getCanonicalName then
if has_redundant_head_param then
error('The "data" variable supplied to "full_headword" should not be a language object.')
if not data.no_redundant_head_cat then
table.insert(data.categories, langname .. " terms with redundant head parameter")
end
end
end
 
-- Were any categories specified?
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if data.categories and #data.categories > 0 then
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" and not m_data.no_multiword_cat[langcode] then
local lang_name = require("Module:string").pattern_escape(data.lang:getCanonicalName())
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
for _, cat in ipairs(data.categories) do
-- Use the pagename, not the head= value, because the latter may have extra
-- Does the category begin with the language name? If not, tag it with a tracking category.
-- junk in it, e.g. superscripted text that throws off the algorithm.
if not mw.ustring.find(cat, "^" .. lang_name) then
local checkpattern = ".[%s%-፡]."
mw.log(cat, data.lang:getCanonicalName())
if m_data.hyphen_not_multiword_sep[langcode] then
table.insert(tracking_categories, "head tracking/no lang category")
-- Exclude hyphens if the data module states that they should be excluded for this language
checkpattern = ".[%s፡]."
--[=[
[[Special:WhatLinksHere/Template:tracking/head tracking/no lang category]]
]=]
require("Module:debug").track{
"headword/no lang category",
"headword/no lang category/lang/" .. data.lang:getCode()
}
end
end
end
if rfind(unmodified_default_head, checkpattern) and not non_categorizable(data) then
if not data.pos_category
table.insert(data.categories, langname .. " multiword terms")
and mw.ustring.find(data.categories[1], "^" .. data.lang:getCanonicalName())
then
data.pos_category = mw.ustring.gsub(data.categories[1], "^" .. data.lang:getCanonicalName() .. " ", "")
table.remove(data.categories, 1)
end
end
end
end
 
if not data.pos_category then
if data.sccat then
error(
for _, head in ipairs(data.heads) do
'No valid part-of-speech categories were found in the list '
table.insert(data.categories, langname .. " " .. data.pos_category .. " in " .. head.sc:getDisplayForm())
.. 'of categories passed to the function "full_headword". '
end
.. 'The part-of-speech category should consist of a language\'s '
.. 'canonical name plus a part of speech.'
)
end
end
 
-- Categorise for unusual characters
-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
local standard = data.lang:getStandardCharacters()
local standard = data.lang:getStandardCharacters()
if standard and not non_categorizable(data) then
if standard then
local function char_category(char)
if mw.ustring.len(title.subpageText) ~= 1 and not mw.ustring.match(title.text, "^Unsupported titles/") then
local specials = {["#"] = "number sign", ["<"] = "less-than sign", [">"] = "greater-than sign", ["["] = "left square bracket", ["]"] = "right square bracket", ["_"] = "underscore", ["{"] = "left curly bracket", ["|"] = "vertical line", ["}"] = "right curly bracket", ["ß"] = "ẞ", ["ͅ"] = "ͅ", ["\239\191\189"] = "replacement character"}
for character in mw.ustring.gmatch(title.subpageText, "([^" .. standard .. "])") do
char = mw.ustring.toNFD(char)
local upper = mw.ustring.upper(character)
:gsub("[%z\1-\127\194-\244][\128-\191]*", function(m)
if not mw.ustring.find(upper, "[" .. standard .. "]") then
local new_m = specials[m]
new_m = new_m or m:uupper()
return new_m
end)
return mw.ustring.toNFC(char)
end
if data.lang:getCode() ~= "lo" then
local standard_chars_scripts = {}
for _, head in ipairs(data.heads) do
standard_chars_scripts[head.sc:getCode()] = true
end
-- Iterate over the scripts, in case there is more than one (as they can have different sets of standard characters).
for code in pairs(standard_chars_scripts) do
local sc_standard = data.lang:getStandardCharacters(code)
if sc_standard then
if m_data.pagename_len > 1 then
local explode_standard = {}
local function explode(char)
explode_standard[char] = true
return ""
end
local sc_standard = rsub(sc_standard, m_data.comb_chars.combined_double, explode)
sc_standard = rsub(sc_standard, m_data.comb_chars.combined_single, explode)
:gsub("[%z\1-\127\194-\244][\128-\191]*", explode)
local num
for char in pairs(m_data.explode_pagename) do
if not explode_standard[char] then
if char:find("[0-9]") then
if not num then
table.insert(data.categories, langname .. " terms spelled with numbers")
end
else
local upper = char_category(char)
if not explode_standard[upper] then
char = upper
end
table.insert(data.categories, langname .. " terms spelled with " .. char)
end
end
end
end
-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
sc_standard = mw.ustring.toNFD(sc_standard)
for diacritic in rgmatch(m_data.decompose_pagename, m_data.comb_chars.diacritics_single) do
if not mw.ustring.find(sc_standard, diacritic) then
table.insert(data.categories, langname .. " terms spelled with ◌" .. diacritic)
end
end
for diacritic in rgmatch(m_data.decompose_pagename, m_data.comb_chars.diacritics_double) do
if not mw.ustring.find(sc_standard, diacritic) then
table.insert(data.categories, langname .. " terms spelled with ◌" .. diacritic .. "◌")
end
end
end
end
-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
elseif ulen(m_data.pagename) ~= 1 then
for character in rgmatch(m_data.pagename, "([^" .. standard .. "])") do
local upper = char_category(character)
if not rfind(upper, "[" .. standard .. "]") then
character = upper
character = upper
end
end
table.insert(
table.insert(data.categories, langname .. " terms spelled with " .. character)
data.categories,
data.lang:getCanonicalName() .. " terms spelled with " .. character
)
end
end
end
end
end
end
 
-- Categorise for palindromes
-- Categorise for palindromes
if title.nsText ~= "Reconstruction"
if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(data.title.subpageText) > 2
and require('Module:palindromes').is_palindrome(
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
title.subpageText, data.lang, data.sc
-- multiple scripts?
) then
and require("Module:palindromes").is_palindrome(data.title.subpageText, data.lang, data.heads[1].sc) then
table.insert(data.categories, data.lang:getCanonicalName() .. " palindromes")
table.insert(data.categories, langname .. " palindromes")
end
 
if namespace == "" and not data.lang:hasType("reconstructed") then
local m_links = require("Module:links")
for _, head in ipairs(data.heads) do
if data.title.prefixedText ~= m_links.getLinkPage(remove_links(head.term), data.lang, head.sc) then
-- [[Special:WhatLinksHere/Template:tracking/headword/pagename spelling mismatch]]
-- [[Special:WhatLinksHere/Template:tracking/headword/pagename spelling mismatch/LANGCODE]]
track("pagename spelling mismatch", data.lang:getCode())
break
end
end
end
end
return
-- Add to various maintenance categories.
show_headword_line(data) ..
export.maintenance_cats(m_data, data.lang, data.categories, data.whole_page_categories)
require("Module:utilities").format_categories(
data.categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
------------ 9. Format and return headwords, genders, inflections and categories. ------------
) ..
 
require("Module:utilities").format_categories(
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
tracking_categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
-- so make sure we do it before evaluating `data.categories`.
)
local text =
format_headword(data) ..
format_genders(data) ..
format_inflections(data)
-- Language-specific categories.
local cats = format_categories(
data.categories, data.lang, data.sort_key, m_data.encoded_pagename,
data.force_cat_output or test_force_categories, data.heads[1].sc
)
-- Language-agnostic categories.
local whole_page_cats = format_categories(
data.whole_page_categories, nil, "-"
)
return text .. cats .. whole_page_cats
end
end


return export
return export

Latest revision as of 01:29, 15 March 2024

This module is used to show headword lines, along with any annotations like genders, transliterations and inflections. It's used by the template {{head}}, via the submodule Module:headword/templates. It's also used by many other headword modules; for a full list, see Category:Headword-line modules. Some of the data used by this module is found in Module:headword/data.

export.head_is_multiword

function export.head_is_multiword(head)

Return true if the given head is multiword according to the algorithm used in full_headword().

export.add_multiword_links

function export.add_multiword_links(head, default)

Add links to a multiword head.

export.pos_lemma_or_nonlemma

function export.pos_lemma_or_nonlemma(plpos, best_guess)

Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.). If you have a POS in its singular form, call pluralize() in Module:string utilities to pluralize it in a smart fashion that knows when to add '-s' and when to add '-es'. If best_guess is given and the POS is in neither the lemma nor non-lemma list, guess based on whether it ends in " forms"; otherwise, return nil.

export.maintenance_cats

function export.maintenance_cats(m_data, lang, lang_cats, page_cats)

Add the page to various maintenance categories for the language and the whole page. These are placed in the headword somewhat arbitrarily, but mainly because headword templates are mandatory for entries (meaning that in theory it provides full coverage). This is provided as an external entry point so that modules which transclude information from other entries (such as {{ja-see}}) can take advantage of this feature as well, because they are used in place of a conventional headword template.

export.full_headword

function export.full_headword(data)

This is the primary external entry point. This is used by {{head}} and various language-specific headword templates (e.g. {{ru-adj}} for Russian adjectives, {{de-noun}} for German nouns, etc.) to display an entire headword line. See #Further explanations for full_headword()

Further explanations for full_headword()

The sole argument, data, is a table containing the following items (WARNING: they will be destructively modified):

{
	lang = language_object,
	pagename = nil or "pagename",
	heads = { "head1", "head2", "head3", ... } or {
		{
			term = nil or "head1",
			tr = nil or "translit1",
			ts = nil or "transcription1",
			sc = nil or script_object,
			q = nil or {"left_qualifier1", "left_qualifier2", ...},
			qq = nil or {"right_qualifier1", "right_qualifier2", ...},
			refs = nil or {{text = "ref_text1" or "", name = nil or "ref_name1", group = nil or "ref_group1"}, ...},
			separator = nil or "separator",
		},
		...
	},
	translits = { [1] = "translit1", [3] = "translit3", ... },
	transcriptions = { [2] = "transcription2", [3] = "transcription3", ... },
	sc = script_object,
	inflections = {
		enable_auto_translit = boolean,
		{ label = "grammatical_category", "inflected_form1", "inflected_form2", ... },
		{ label = "grammatical_category", accel = {form = "tag|tag", lemma = "lémma"}, "inflected_form1", "inflected_form2", ... },
		{
			label = "grammatical_category",
			accel = {
				form = "tag|tag",
				target = "form_target",
				tr = nil or "form_manual_translit",
				gender = "gender_spec" or {"gender_spec1", "gender_spec2", ...},
				pos = "form_part_of_speech",
				lemma = nil or "lémma",
				lemma_translit = nil or "lemma_manual_translit",
				no_store = boolean,
			},
			sc = nil or inflection_specific_script_object,
			enable_auto_translit = boolean,
			"inflected_form1",
			{
				term = "inflected_form2",
				alt = nil or "display_text",
				translit = nil or "manual_transliteration",
				transcription = nil or "manual_transcription",
				gender = {"gender1", "gender2", {spec = "gender3", qualifiers = nil or {"qualifier1", "qualifier2", ... }}},
				accel = {form = "tag|tag|tag", lemma = "lemma_of_inflected_form", lemma_translit = "manual_translit" },
				lang = nil or term_specific_lang_object,
				sc = nil or term_specific_script_object,
				id = "sense_id",
				q = nil or {"left_qualifier1", "left_qualifier2", ... },
				qq = nil or {"right_qualifier1", "right_qualifier2", ... },
				refs = nil or {{text = "ref_text1" or "", name = nil or "ref_name1", group = nil or "ref_group1"}, ...},
				separator = nil or "separator",
				nolink = boolean,
				hypothetical = boolean,
			},
			{
				label = "raw_textual_label",
				q = nil or {"left_qualifier1", "left_qualifier2", ... },
				qq = nil or {"right_qualifier1", "right_qualifier2", ... },
				refs = nil or {{text = "ref_text1" or "", name = nil or "ref_name1", group = nil or "ref_group1"}, ...},
				separator = nil or "separator",
			},
			...
		},
		{ label = "grammatical_category", request = true },
		...
	},
	genders = {
		"gender1",
		{spec = "gender2", qualifiers = {"qualifier1", "qualifier2", ...}},
		...
	},
	pos_category = "plural_part_of_speech",
	categories = { "category1", "category2", ... },
	whole_page_categories = { "category1", "category2", ... },
	force_cat_output = boolean,
	sccat = boolean,
	noposcat = boolean,
	nogendercat = boolean,
	nomultiwordcat = boolean,
	nopalindromecat = boolean,
	nolink = boolean,
	sort_key = "sort_key",
	id = "sense_id",
}

Further explanation:

  • lang is required and is a language object from Module:languages corresponding to a given language. For example, use require("Module:languages").getByCode("ru") to retrieve the object corresponding to Russian.
    • pagename is optional and allows you to override the pagename used variously in the module (e.g. as the default value when a head is omitted, for setting categories such as palindromes and terms spelled with CHAR, etc.).
      • is a table listing the heads of the headword. Each element is either a string specifying only the headword itself (old-style), or an object specifying all the properties of the headword (new-style). You cannot mix and match these two styles; all elements should be of one type or the other. If no heads are specified at all ( is omitted or is an empty array), a default head is set based on the assumed pagename (either the actual pagename or the value of , if set). When using old-style head strings, a given head in the array can be , in which case a default head is set as above. When head objects are used, a given object can have the following properties:
          • : A string specifying the headword. This can be omitted, in which case a default head is set as above. Explicit headwords are generally used to specify extra diacritics (in languages with such diacritics, e.g. Russian, Arabic, Latin, Ancient Greek, Old English, etc.), or to link individual words of a multiword term, particularly when the words are inflected forms. Note that by default, each word is linked individually to itself, so there is no need to specify links for a term like a golden key can open any door. Some additional notes:
                • If a headword string contains wikilinks, they are converted into language-section links for the given language (using Module:links#language_link, which is also used by {{l}}). For example, giving , if the language provided is English, will produce: . If string is prefixed with * or if any of the links are, then they are interpreted as reconstructed terms and it will create links to the Reconstruction namespace as appropriate.
                      • If the page name contains spaces or punctuation marks (except for punctuation marks that are used inside of words), it is split and each individual word is automatically wikilinked as above.
                      • If the current page is in the Reconstruction: namespace, then an asterisk will be prepended to the headword to indicate that it is a reconstructed term.
                          • : A string specifying the transliteration of the headword. This is only needed when the headword is in a non-Latin script, and even then only when the automatic transliteration specified using the language's transliteration module is incorrect (or the language has no transliteration module, such as with Persian and Hebrew). For languages with a transliteration module, pass in to suppress the transliteration entirely.
                              • : A string specifying the transcription of the headword. This is only used in a few languages with non-Latin scripts where the spelling is significantly different from the pronunciation, such as Akkadian, Old Persian or Hittite. In cases like this, the transliteration usually reflects the spelling and the transcription reflects the pronunciation. For this reason, transcriptions are displayed between slashes. Transcriptions should NOT be used simply to display IPA pronunciation of a language like Russian or Arabic. Unlike for transliterations, there are no automatic transcription modules.
                                  • : An optional script object from Module:scripts corresponding to a given script, specifying the script that the headword is in. If omitted, defaults to the top-level value. Most of the time, neither the per-headword script nor top-level script need to be specified: If both are omitted, Module:scripts will determine the script(s) using the list of scripts in the language's data file and the characters that are in the headword. Specifically, if there are multiple possible scripts for a language, the script with the largest number of characters in the headword is chosen.
                                      • q: An optional array specifying one or more qualifiers displayed to the left of the headword. Qualifiers are displayed in italics and with parentheses around them, and are intended to specify relevant properties of the headword, especially when there is more than one headword.
                                          • qq: An optional array specifying one or more qualifiers displayed to the right of the headword, as above.
                                              • refs: An optional array specifying one or more references (i.e. footnotes) for the headword. This is similar to using <ref>...</ref> to specify a reference/footnote after a given word in the text. Each element of the array is either a string (the text of the reference) or an object of the form {text = "ref_text", name = "ref_name", group = "ref_group"}. In this latter format:
                                                    • specifies the reference text (which cannot be ; use a blank string when cross-referencing to another reference);
                                                          • gives an optional name to the reference for cross-reference purposes, if the reference text is non-empty, similarly to <ref name="ref_name">ref_text</ref>; however, if the reference text is empty, it specifies a cross-reference to a previously-named reference, similarly to <ref name="ref_name"/>;
                                                                • gives an optional group to the reference for grouping purposes, similarly to <ref name="ref_name" group="ref_group">ref_text</ref>; however, if the reference text is empty, it specifies the group of a cross-reference to a previously-named reference, similarly to <ref name="ref_name" group="ref_group"/>.
                                                                    • : The separator preceding the headword. If omitted, the default value is  <i>or</i>  (i.e. the italicized word or surrounded by spaces) for the second and higher headword, and a blank string for the first headword. Use a blank string to request no separator at all.
                                                                      • is an optional table listing the transliterations corresponding to each headword in , when old-style head strings are used; omitting this field is equivalent to setting it to an empty list. If new-style head objects are used, this field must be omitted. The Nth numbered entry should be either a string specifying the transliteration of headword N, or may be omitted, as with the property described above. Note that, if there are multiple headwords, the table in might have entries in the middle of the list that are . A list of this sort cannot be created with , as attempting to insert this way does nothing. Instead, each transliteration must be explicitly assigned using a number as index, e.g. ; here, item is , because no value was assigned to it.
                                                                        • transcriptions is an optional table listing the transcriptions corresponding to each headword in heads, when old-style head strings are used; omitting this field is equivalent to setting it to an empty list. If new-style head objects are used, this field must be omitted. It is of the same format as translits, and can have holes in it as needed. The meaning of the transcription field is as described above for ts.
                                                                          • is an optional script object from Module:scripts corresponding to a given script. If specified, this applies equally to all heads specified using ; if you need to specify per-head scripts, use the head object format documented above. Most of the time you can omit this item, and Module:scripts will determine the script(s) as specified above for the headword property.
                                                                            • is a table listing the gender/number specifications for the headwords. This can be omitted for no genders or numbers. Each element is either a string specifying a gender/number spec, or a table of the form . In either case, the accepted values for genders or numbers are given in Module:gender and number; examples are for masculine, for feminine animate plural and for noun class 2 in languages such as Swahili that have noun classes. If the format with qualifiers is given, the qualifiers are displayed to the left of the gender/number specification. Categories are automatically added according to the specific genders, e.g. LANG masculine nouns for the language specified in if the gender is masculine and the part of speech (see below) is nouns or reconstructed nouns. To suppress the addition of these categories, specify .
                                                                              • is a table listing the inflections to be displayed in the headword entry. The format of this table is somewhat complex and is described below under format_inflections.
                                                                                • is the part-of-speech category for the entry. This is one of the lemma and nonlemma parts of speech listed in Module:headword/data. It should be in the plural: for example, . If this item is omitted, the part of speech category must be included in as the first item in .
                                                                                  • is a table listing the categories to which the entry containing the headword will be added. The first category should be a part-of-speech category, with the canonical name of the language at the beginning – – unless the part of speech is given in the field .
                                                                                    • is a table listing language-agnostic categories to which the page will be added, which it is nevertheless useful for the headword module to handle (e.g. Category:Unsupported titles). Because they are not tied to a language, pages in them should be sorted according to their {{DEFAULTSORT:}} values for the sake of consistency. Note that some of these - including "Category:Unsupported titles" - are already handled automatically.
                                                                                      • is a string specifying a sort key for the categories listed in . Sort keys should usually be omitted, because the format_categories function in Module:utilities will generate a suitable sortkey in most cases. The sortkey is used to ensure that the page is listed in the correct order in the categories to which it belongs.
                                                                                        • is a boolean value determining whether or not to link the forms of the entire headword. Not to be confused with , which disables linking only for one of the forms. It is used, for example, by Module:la-headword for reconstructed terms.

                                                                                          Examples

                                                                                          A simple example

                                                                                          full_headword{
                                                                                          	lang = require("Module:languages").getByCode("en"),	-- language code
                                                                                          	heads = {"book"},									-- headwords
                                                                                          	inflections = {
                                                                                          		{label = "plural", "books"}						-- inflections
                                                                                          	},
                                                                                          	categories = {"English nouns"},						-- part-of-speech category
                                                                                          }
                                                                                          

                                                                                          might give (depending on the page it's run on):

                                                                                          <strong class="Latn headword" lang="en">book</strong> (''plural'' <b class="Latn" lang="en">[[books#English|books]]</b>)[[Category:English lemmas|HEADWORD]][[Category:English nouns|HEADWORD]]
                                                                                          

                                                                                          which displays as:

                                                                                          book (plural books)

                                                                                          A fuller example

                                                                                          full_headword{
                                                                                          	lang = require("Module:languages").getByCode("de"),
                                                                                          	heads = {"Hund"},
                                                                                          	genders = {"m"},
                                                                                          	inflections = {
                                                                                          		{label = "genitive", "Hundes", "Hunds"},
                                                                                                  {label = "plural", "Hunde", {term="Hünde", q="nonstandard"}},
                                                                                          		{label = "diminutive",
                                                                                          			{term = "Hündchen", genders = {"n"}},
                                                                                          			{nolink=true, term = "Hündlein", genders = {"n"}}
                                                                                          		}
                                                                                          	},
                                                                                          	categories = {"German nouns"},
                                                                                          }
                                                                                          

                                                                                          might give (depending on the page it's run on):

                                                                                          <strong class="Latn headword" lang="de">Hund</strong>&nbsp;<span class="gender"><abbr title="masculine gender">m</abbr></span> (''genitive'' <b class="Latn" lang="de">[[Hundes#German|Hundes]]</b> ''or'' <b class="Latn" lang="de">[[Hunds#German|Hunds]]</b>, ''plural'' <b class="Latn" lang="de">[[Hunde#German|Hunde]] </b>''or (nonstandard)''<b> [[Hünde#German|Hünde]]</b>, ''diminutive'' <b class="Latn" lang="de">[[Hündchen#German|Hündchen]]</b>&nbsp;<span class="gender"><abbr title="neuter gender">n</abbr></span> ''or'' <b class="Latn" lang="de">Hündlein</b>&nbsp;<span class="gender"><abbr title="neuter gender">n</abbr></span>)[[Category:German lemmas|HEADWORD]][[Category:German nouns|HEADWORD]]
                                                                                          

                                                                                          which displays as:

                                                                                          Hund m (genitive Hundes or Hunds, plural Hunde or (nonstandard) Hünde, diminutive Hündchen n or Hündlein n)

                                                                                          An example in a non-Latin script

                                                                                          This example is in Russian, which has automatic transliteration:

                                                                                          full_headword{
                                                                                          	lang = require("Module:languages").getByCode("ru"),
                                                                                          	heads = {"кни́га"},
                                                                                          	genders = {"f-in"},
                                                                                          	inflections = {
                                                                                          		{label = "genitive", "кни́ги"},
                                                                                          		{label = "nominative plural", "кни́ги"},
                                                                                          		{label = "genitive plural", "книг"}
                                                                                          	},
                                                                                          	categories = {"Russian nouns"},
                                                                                          }
                                                                                          

                                                                                          might give (depending on the page it's run on):

                                                                                          <strong class="Cyrl headword" lang="ru">кни́га</strong> [[Wiktionary:Russian transliteration|•]] (<span class="tr" lang=""><span class="tr" lang="">kníga</span></span>)&nbsp;<span class="gender"><abbr title="feminine gender">f</abbr>&nbsp;<abbr title="inanimate">inan</abbr></span> (''genitive'' <b class="Cyrl" lang="ru">[[книги#Russian|кни́ги]]</b>, ''nominative plural'' <b class="Cyrl" lang="ru">[[книги#Russian|кни́ги]]</b>, ''genitive plural'' <b class="Cyrl" lang="ru">[[книг#Russian|книг]]</b>)[[Category:Russian lemmas|HEADWORD]][[Category:Russian nouns|HEADWORD]]
                                                                                          

                                                                                          which displays as

                                                                                          кни́га (knígaf inan (genitive кни́ги, nominative plural кни́ги, genitive plural книг)

                                                                                          Note a few things about the transliteration:

                                                                                          • If the transliteration is specified and non-empty, Module:headword adds some stuff before and after it. For example, if the transliteration is and the language is Hebrew, produces
                                                                                             [[Wiktionary:Hebrew transliteration|•]] (<span lang="">foo</span>)
                                                                                            
                                                                                            which looks like “• (foo)”.

                                                                                            A fuller example in a non-Latin script

                                                                                            This example is in Russian, with two headwords, each of which requires manual transliteration:

                                                                                            full_headword{
                                                                                            	lang = require("Module:languages").getByCode("ru"),
                                                                                            	heads = {
                                                                                            		{term = "интервьюе́р", tr = "intɛrvʹjuér"},
                                                                                            		{term = "интервью́ер", "intɛrvʹjújer"},
                                                                                            	},
                                                                                            	genders = {"m-an"},
                                                                                            	inflections = {
                                                                                            		{label = "genitive", "интервьюе́ра", "интервью́ера"},
                                                                                            		{label = "nominative plural", "интервьюе́ры", "интервью́еры"},
                                                                                            		{label = "genitive plural", "интервьюе́ров", "интервью́еров"},
                                                                                            	},
                                                                                            	categories = {"Russian nouns"},
                                                                                            }
                                                                                            

                                                                                            might give (depending on the page it's run on):

                                                                                            <strong class="Cyrl headword" lang="ru">интервьюе́р</strong> ''or'' <strong class="Cyrl headword" lang="ru">интервью́ер</strong> [[Wiktionary:Russian transliteration|•]] (<span class="tr" lang=""><span class="tr" lang="">intɛrvʹjuér</span> ''or'' <span class="tr" lang="">intɛrvʹjújer</span></span>)&nbsp;<span class="gender"><abbr title="masculine gender">m</abbr>&nbsp;<abbr title="animate">anim</abbr></span> (''genitive'' <b class="Cyrl" lang="ru">[[интервьюера#Russian|интервьюе́ра]]</b> ''or'' <b class="Cyrl" lang="ru">[[интервьюера#Russian|интервью́ера]]</b>, ''nominative plural'' <b class="Cyrl" lang="ru">[[интервьюеры#Russian|интервьюе́ры]]</b> ''or'' <b class="Cyrl" lang="ru">[[интервьюеры#Russian|интервью́еры]]</b>, ''genitive plural'' <b class="Cyrl" lang="ru">[[интервьюеров#Russian|интервьюе́ров]]</b> ''or'' <b class="Cyrl" lang="ru">[[интервьюеров#Russian|интервью́еров]]</b>)[[Category:Russian lemmas|HEADWORD]][[Category:Russian nouns|HEADWORD]]
                                                                                            

                                                                                            which displays as

                                                                                            интервьюе́р or интервью́ер • (intɛrvʹjuér or intɛrvʹjújer) m anim (genitive интервьюе́ра or интервью́ера, nominative plural интервьюе́ры or интервью́еры, genitive plural интервьюе́ров or интервью́еров)

                                                                                            Another fuller example in a non-Latin script

                                                                                            This example is in Arabic, with embedded links in the headword, manual transliteration in an inflection and use of :
                                                                                            full_headword{
                                                                                            	lang = require("Module:languages").getByCode("ar"),
                                                                                            	heads = {
                                                                                            		{term = "[[غُدّة]] [[بَصَلِيّ|بَصَلِيّة]] [[إحْلِيلِيّ|إحْلِيلِيّة]]", tr = "ḡudda baṣaliyya ʾiḥlīliyya"},
                                                                                            	},
                                                                                            	genders = {"f"},
                                                                                            	inflections = {
                                                                                            		enable_auto_translit = true,
                                                                                            		{label = "plural", {term="غُدَد بَصَلِيَّة إِحْلِيلِيَة", translit="ḡudad baṣaliyya ʾiḥlīliyya"}},
                                                                                            	},
                                                                                            	categories = {"Arabic nouns"},
                                                                                            }
                                                                                            

                                                                                            might give (depending on the page it's run on):

                                                                                            <strong class="Arab headword" lang="ar">[[غدة#Arabic|غُدّة]] [[بصلي#Arabic|بَصَلِيّة]] [[إحليلي#Arabic|إحْلِيلِيّة]]</strong> [[Wiktionary:Arabic transliteration|•]] (<span class="tr" lang=""><span class="tr" lang="">ḡudda baṣaliyya ʾiḥlīliyya</span></span>)&nbsp;<span class="gender"><abbr title="feminine gender">f</abbr></span> (''plural'' <b class="Arab" lang="ar">[[غدد بصلية إحليلية#Arabic|غُدَد بَصَلِيَّة إِحْلِيلِيَة]]</b> (<span lang="" class="tr">ḡudad baṣaliyya ʾiḥlīliyya</span>))[[Category:Arabic lemmas|HEADWORD]][[Category:Arabic nouns|HEADWORD]]
                                                                                            

                                                                                            which displays as

                                                                                            غُدّة بَصَلِيّة إحْلِيلِيّة (ḡudda baṣaliyya ʾiḥlīliyyaf (plural غُدَد بَصَلِيَّة إِحْلِيلِيَة (ḡudad baṣaliyya ʾiḥlīliyya))

                                                                                            Proposed/planned changes

                                                                                            • Checking for invalid genders, given a list of genders that are valid for a particular language.

local export = {}
local m_string_utils = require("Module:string utilities")

local rfind = m_string_utils.find
local rgmatch = m_string_utils.gmatch
local rsubn = m_string_utils.gsub
local ulen = mw.ustring.len
local unfc = mw.ustring.toNFC

local m_data = mw.loadData("Module:headword/data")

local isLemma = m_data.lemmas
local isNonLemma = m_data.nonlemmas
local notranslit = m_data.notranslit
local toBeTagged = m_data.toBeTagged

-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false

-- Single-result wrapper around rsubn(): performs the same substitution but hands back only the
-- resulting string, dropping the substitution count that rsubn() also returns.
local function rsub(term, foo, bar)
	local replaced = rsubn(term, foo, bar)
	return replaced
end

-- Record that the current entry has the (usually undesirable) property identified by `track_id`.
--
-- The tracking page name is "headword/TRACK_ID"; it is handed to [[Module:debug/track]], so that the entry links to
-- [[Template:tracking/headword/TRACK_ID]] and can be found through
-- [[Special:WhatLinksHere/Template:tracking/headword/TRACK_ID]].
--
-- When `code` (a language or script code) is supplied, the more specific page "headword/TRACK_ID/CODE" is tracked as
-- well, which allows narrowing the list down to one language or script via
-- [[Special:WhatLinksHere/Template:tracking/headword/TRACK_ID/CODE]].
--
-- Always returns true.
local function track(track_id, code)
	local base_page = "headword/" .. track_id
	local debug_track = require("Module:debug/track")
	if not code then
		debug_track(base_page)
		return true
	end
	-- Track both the generic page and the per-code subpage in one call.
	debug_track{base_page, base_page .. "/" .. code}
	return true
end


-- Return true if `text` contains at least one character belonging to the script whose code is `script_code`,
-- false otherwise (including when the script object reports no character set).
-- Raises an internal error if `script_code` does not resolve to a script object.
local function text_in_script(text, script_code)
	local script = require("Module:scripts").getByCode(script_code)
	if not script then
		error("Internal error: Bad script code " .. script_code)
	end

	local charset = script:getCharacters()
	if not charset then
		return false
	end

	-- Drop everything matched by %W before testing against the script's character class.
	local stripped = rsub(text, "%W", "")
	if rfind(stripped, "[" .. charset .. "]") then
		return true
	end
	return false
end


-- Pattern matching a maximal run of whitespace and/or punctuation characters. Each such run is a candidate
-- word break (see head_is_multiword() and add_multiword_links()).
local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
	 Used to exclude characters from the pattern above: a run matched by `spacingPunctuation`
	 is only treated as a word break if it contains a character that is not in this list. ]]
local wordPunc = "-־׳״'.·*’་•:"
-- Pattern matching a run of one or more characters that are NOT in `wordPunc`.
local notWordPunc = "[^" .. wordPunc .. "]+"


-- Decorate an already-formatted term (a head term or an inflection term) with its qualifiers, references and
-- separator. `part` is the object describing the term; the fields read here are:
-- * `q`: left qualifiers (non-empty array of strings), rendered before the term and followed by a space;
-- * `qq`: right qualifiers (non-empty array of strings), rendered after the term and preceded by a space;
-- * `refs`: references, each either a string (the formatted reference text) or an object with field `text` and
--   optionally `name` and/or `group`; each becomes a <ref> extension tag placed directly after the term, before
--   any right qualifiers;
-- * `separator`: text placed in front of everything else. If absent, " <i>or</i> " is used when this is not the
--   first term (j > 1) and nothing is prepended for the first term.
-- `formatted` is the formatted term itself and `j` is the 1-based index of the term. Returns the decorated string.
local function format_term_with_qualifiers_and_refs(part, formatted, j)
	local prefix, suffix, footnotes

	if part.q and #part.q > 0 then
		prefix = require("Module:qualifier").format_qualifier(part.q) .. " "
	end

	if part.qq and #part.qq > 0 then
		suffix = " " .. require("Module:qualifier").format_qualifier(part.qq)
	end

	if part.refs and #part.refs > 0 then
		local rendered = {}
		for _, ref in ipairs(part.refs) do
			-- Normalize a bare string into the object form.
			local spec = ref
			if type(spec) ~= "table" then
				spec = {text = spec}
			end
			-- Only pass tag attributes when at least one of them is present.
			local attrs
			if spec.name or spec.group then
				attrs = {name = spec.name, group = spec.group}
			end
			table.insert(rendered, mw.getCurrentFrame():extensionTag("ref", spec.text, attrs))
		end
		footnotes = table.concat(rendered)
	end

	-- An explicit separator (even "") wins; otherwise only non-first terms get the default.
	local separator = part.separator
	if not separator and j > 1 then
		separator = " <i>or</i> "
	end

	local result = formatted
	if prefix then
		result = prefix .. result
	end
	if footnotes then
		result = result .. footnotes
	end
	if suffix then
		result = result .. suffix
	end
	if separator then
		result = separator .. result
	end

	return result
end


--[==[Return true if the given head is multiword according to the algorithm used in full_headword(): that is, if
some run of spacing/punctuation in the head contains at least one character that is not word-internal
punctuation. Otherwise return false.]==]
function export.head_is_multiword(head)
	local multiword = false
	for separator_run in rgmatch(head, spacingPunctuation) do
		if rfind(separator_run, notWordPunc) then
			multiword = true
			break
		end
	end

	return multiword
end


--[==[Add links to a multiword head. The head is first marked up with the control characters \1 (link start) and
\2 (link end) around each word, and only at the very end are these turned into "[[" and "]]". If `default` is set,
colons and number signs inside a word are additionally backslash-escaped.]==]
function export.add_multiword_links(head, default)
	-- Within a run of spacing/punctuation, close the current link before each `notWordPunc` match and reopen it
	-- afterwards, so that those characters end up outside the links.
	local function split_around_punctuation(run)
		return rsub(run, notWordPunc, "\2%1\1")
	end

	local marked = "\1" .. rsub(head, spacingPunctuation, split_around_punctuation) .. "\2"

	if default then
		-- Inside a \1...\2 span: first double a backslash that already precedes ":" or "#", then put a backslash
		-- in front of ":" or "#" (presumably so the link code treats them literally -- confirm against
		-- [[Module:links]]).
		local doubled = marked:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
		local escaped = doubled:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
		marked = escaped
	end

	-- Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
	marked = require("Module:utilities").make_entities(marked, "%[%]")

	-- Once the control-character workaround is no longer needed, the whole thing could become:
	--     head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
	-- followed by removal of empty links.

	-- Drop empty links, which could have been created above at the beginning or end of the string, then convert
	-- the control characters into real link brackets.
	local without_empty_links = marked:gsub("\1\2", "")
	local linked = without_empty_links:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"})
	return linked
end


-- Return a truthy value if the page in `data.title` should be skipped by the categorization code that calls this:
-- either its text contains "Main Page", or it is in the Appendix namespace and its text starts with "Gestures/".
-- The truthy value is the match position from string.find; otherwise the result is false or nil.
local function non_categorizable(data)
	local page = data.title
	local main_page_pos = page.text:find("Main Page")
	if main_page_pos then
		return main_page_pos
	end
	return page:inNamespace("Appendix") and page.text:find("^Gestures/")
end


-- Format the heads in `data.heads`, followed by their transliterations/transcriptions (if any) in parentheses.
-- Returns the resulting string.
local function format_headword(data)
	local m_scriptutils = require("Module:script utilities")

	-- Wrap the display text of one head in its language/script tag, then attach qualifiers, references and the
	-- separator appropriate for position `index`. Only position 1 carries `data.id`.
	local function decorate(head, display, index)
		local tagged = m_scriptutils.tag_text(display, data.lang, head.sc, "head", nil, index == 1 and data.id or nil)
		return format_term_with_qualifiers_and_refs(head, tagged, index)
	end

	------ Format the headwords. ------

	local any_translit = false
	local any_manual_translit = false
	local rendered_heads = {}
	-- Used as a set: keys are the heads as they would look in first position.
	local distinct_heads = {}
	local several_heads = #data.heads > 1

	for index, head in ipairs(data.heads) do
		if head.tr or head.ts then
			any_translit = true
		end
		if head.tr and head.tr_manual or head.ts then
			any_manual_translit = true
		end

		-- Heads that already contain links go through the link formatter (except in the "Imag" script); all
		-- others just get the language's display-text processing.
		local display
		if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Imag" then
			display = require("Module:links").language_link({term = head.term, lang = data.lang}, false)
		else
			display = data.lang:makeDisplayText(head.term, head.sc, true)
		end

		local piece = decorate(head, display, index)
		table.insert(rendered_heads, piece)

		-- With several heads, find out whether they would all display the same. Separator and id depend on the
		-- position, so every head is rendered again as if it were the first one before being compared.
		if several_heads then
			local as_first
			if index == 1 then
				as_first = piece
			else
				as_first = decorate(head, display, 1)
			end
			distinct_heads[as_first] = true
		end
	end

	local distinct_count = 0
	if several_heads then
		for _ in pairs(distinct_heads) do
			distinct_count = distinct_count + 1
		end
	end

	-- If every head renders identically, show it only once.
	local heads_text
	if distinct_count == 1 then
		heads_text = rendered_heads[1]
	else
		heads_text = table.concat(rendered_heads)
	end

	if any_manual_translit then
		-- [[Special:WhatLinksHere/Template:tracking/headword/has-manual-translit]]
		-- [[Special:WhatLinksHere/Template:tracking/headword/has-manual-translit/LANGCODE]]
		track("has-manual-translit", data.lang:getCode())
	end

	------ Format the transliterations and transcriptions. ------

	local translit_text = ""

	if any_translit then
		local per_head = {}
		for _, head in ipairs(data.heads) do
			local tr, ts = head.tr, head.ts
			if tr or ts then
				-- Transliteration first, then the transcription between slashes, separated by a space.
				local bits = {}
				if tr then
					bits[#bits + 1] = m_scriptutils.tag_translit(tr, data.lang:getCode(), "head", nil, head.tr_manual)
					if ts then
						bits[#bits + 1] = " "
					end
				end
				if ts then
					bits[#bits + 1] = "/" .. m_scriptutils.tag_transcription(ts, data.lang:getCode(), "head") .. "/"
				end
				per_head[#per_head + 1] = table.concat(bits)
			end
		end

		translit_text = " (" .. table.concat(per_head, " <i>or</i> ") .. ")"

		-- Link to the language's transliteration page in the Project namespace if that page exists. The
		-- existence check is wrapped in pcall so that an error raised by it is ignored rather than propagated.
		local tr_page = mw.title.new(data.lang:getCanonicalName() .. " transliteration", "Project")
		if tr_page then
			local ok, exists = pcall(function () return tr_page.exists end)
			if ok and exists then
				translit_text = " [[Project:" .. data.lang:getCanonicalName() .. " transliteration|•]]" .. translit_text
			end
		end
	end

	------ Paste heads and transliterations/transcriptions. ------

	return heads_text .. translit_text
end


-- Format `data.genders` for display after the headword. Returns "" if there are no genders; otherwise returns the
-- formatted gender text preceded by a non-breaking space, and appends any categories produced by
-- [[Module:gender and number]] to `data.categories`.
local function format_genders(data)
	local genders = data.genders
	if not genders or #genders == 0 then
		return ""
	end

	-- Gender/number categories are skipped if the caller or the data module says so for this language.
	local gender_cat_pos
	if not (data.nogendercat or m_data.no_gender_cat[data.lang:getCode()]) then
		local base_pos = data.pos_category:gsub("^reconstructed ", "")
		gender_cat_pos = m_data.pos_for_gender_number_cat[base_pos]
	end

	local m_gender = require("Module:gender and number")
	local gender_text, gender_cats = m_gender.format_genders(genders, data.lang, gender_cat_pos)
	for _, gender_cat in ipairs(gender_cats) do
		table.insert(data.categories, gender_cat)
	end

	return "&nbsp;" .. gender_text
end


-- Format one inflection (`parts`: an array of terms plus optional fields such as `label`, `request`, `accel`, `sc`
-- and `enable_auto_translit`). Each array element is replaced in place by its formatted string. Returns two values:
-- the formatted inflection (italicized label followed by the terms), and a boolean that is true if any term was
-- linked with a transliteration setting other than "-".
local function format_inflection_parts(data, parts)
	local saw_translit = false

	for index, part in ipairs(parts) do
		-- A bare (non-table) element is shorthand for a term with no further properties.
		if type(part) ~= "table" then
			part = {term = part}
		end

		local rendered
		if part.label then
			-- FIXME: There should be a better way of italicizing a label. As is, this isn't customizable.
			rendered = "<i>" .. part.label .. "</i>"
		else
			-- `data.nolink` disables links for every inflection of the headword at once, for when inflected forms
			-- aren't entry-worthy (e.g. in Vulgar Latin).
			local unlinked = part.hypothetical or part.nolink or data.nolink

			local face
			if part.hypothetical then
				face = "hypothetical"
			else
				face = "bold"
			end

			-- Don't show a transliteration unless one is given explicitly or enable_auto_translit is requested,
			-- either for this inflection (`parts`) or for all of them (`data.inflections`); "-" suppresses it.
			-- In {{head}} this is controlled by fNautotr=1 and autotrinfl=1 respectively. The default avoids
			-- clutter in languages whose script is easy for learners to read (e.g. Greek, Russian) while
			-- allowing it where the script is more complex (e.g. Arabic).
			local tr = part.translit
			if not tr then
				if parts.enable_auto_translit or data.inflections.enable_auto_translit then
					tr = nil
				else
					tr = "-"
				end
			end
			if tr ~= "-" then
				saw_translit = true
			end

			-- A linked term goes in `term`; an unlinked one is shown through `alt` instead.
			local link_term
			if not unlinked and part.term then
				link_term = part.term
			end
			local display_alt = part.alt
			if not display_alt then
				if unlinked and part.term then
					display_alt = part.term
				else
					display_alt = nil
				end
			end

			rendered = require("Module:links").full_link(
				{
					term = link_term,
					alt = display_alt,
					lang = part.lang or data.lang,
					-- FIXME, code smell in always using the first script.
					sc = part.sc or parts.sc or (not part.lang and data.heads[1].sc),
					id = part.id,
					genders = part.genders,
					tr = tr,
					ts = part.transcription,
					accel = part.accel or parts.accel,
				},
				face,
				false
			)
		end

		parts[index] = format_term_with_qualifiers_and_refs(part, rendered, index)
	end

	local terms_text
	if #parts > 0 then
		-- Separate the label from the first term with a space.
		terms_text = (parts.label and " " or "") .. table.concat(parts)
	elseif parts.request then
		terms_text = " <small>[please provide]</small>"
		table.insert(data.categories, "Requests for inflections in " .. data.lang:getCanonicalName() .. " entries")
	else
		terms_text = ""
	end

	local label_text = ""
	if parts.label then
		label_text = "<i>" .. parts.label .. "</i>"
	end

	return label_text .. terms_text, saw_translit
end


-- Format the inflections following the headword. Each entry of `data.inflections` is replaced in place by its
-- formatted string; the entries are then joined with ", " and wrapped in parentheses (with a leading space).
-- Returns "" if there are no inflections.
local function format_inflections(data)
	local inflections = data.inflections
	if not inflections or #inflections == 0 then
		return ""
	end

	for index, inflection in ipairs(inflections) do
		-- format_inflection_parts() also reports whether any part has a transliteration; that second return
		-- value is not used here, so the parentheses discard it.
		inflections[index] = (format_inflection_parts(data, inflection))
	end

	return " (" .. table.concat(inflections, ", ") .. ")"
end

--[==[
-- Classify a part of speech as "lemma" or "non-lemma form"; return nil if it is unknown. `plpos` must be the
-- plural form ("nouns", "prefixes", etc.); to pluralize a singular POS, use pluralize() in
-- [[Module:string utilities]], which knows when to add '-s' and when to add '-es'.
--
-- The lookup is tried on `plpos` as given and with a leading "reconstructed " removed. Failing that, a POS that
-- is recognized once a leading "mutated " is removed counts as a non-lemma form. If still unrecognized and
-- `best_guess` is given, guess "non-lemma form" if the POS ends in " forms" and "lemma" otherwise.]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
	local without_recon = plpos:gsub("^reconstructed ", "")

	-- Lemma lists take precedence over non-lemma lists.
	if isLemma[plpos] or isLemma[without_recon] then
		return "lemma"
	end
	if isNonLemma[plpos] or isNonLemma[without_recon] then
		return "non-lemma form"
	end

	-- A mutated form of any known POS is a non-lemma form, even if the underlying POS is a lemma.
	local without_mut = plpos:gsub("^mutated ", "")
	if isLemma[without_mut] or isNonLemma[without_mut] then
		return "non-lemma form"
	end

	if not best_guess then
		return nil
	end
	if plpos:find(" forms$") then
		return "non-lemma form"
	end
	return "lemma"
end


-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
-- zero-length array if unspecified (nil or false). Raises an error if `data[element]` is not a table, if any key is
-- non-numeric (other than "maxindex", which is set by [[Module:parameters]] for list parameters), if any truthy
-- value is not a string, or -- unless `allow_blank_string` is given -- if any value is a blank (zero-length) string.
-- A value of `false` is accepted and means "use the default".
local function init_and_find_maximum_index(data, element, allow_blank_string)
	if not data[element] then
		data[element] = {}
	end
	local list = data[element]
	local typ = type(list)
	if typ ~= "table" then
		error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
	end

	local maxind = 0
	-- pairs() rather than ipairs(), because the array may have gaps.
	for k, v in pairs(list) do
		if k ~= "maxindex" then
			if type(k) ~= "number" then
				-- tostring() because the offending key need not be a string (e.g. a boolean), and "%s" in
				-- Lua 5.1 only accepts strings and numbers.
				error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(tostring(k), element))
			end
			if k > maxind then
				maxind = k
			end
			if v then
				if type(v) ~= "string" then
					error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
				end
				if not allow_blank_string and v == "" then
					error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
				end
			end
		end
	end
	return maxind
end

--[==[
-- Add maintenance categories for the language (appended to `lang_cats`) and for the whole page (appended to
-- `page_cats`), based on flags in the data table `m_data`. These are placed in the headword somewhat arbitrarily,
-- but mainly because headword templates are mandatory for entries (meaning that in theory it provides full
-- coverage).
--
-- This is provided as an external entry point so that modules which transclude information from other entries
-- (such as {{tl|ja-see}}) can take advantage of this feature as well, because they are used in place of a
-- conventional headword template.]==]
function export.maintenance_cats(m_data, lang, lang_cats, page_cats)
	local langcode = lang:getCode()
	local langname = lang:getCanonicalName()

	-- Language-level categories: note that one lookup is keyed by language code, the other by canonical name.
	if m_data.wikitext_topic_cat[langcode] then
		table.insert(lang_cats, langname .. " entries with topic categories using raw markup")
	end
	if m_data.wikitext_langname_cat[langname] then
		table.insert(lang_cats, langname .. " entries with language name categories using raw markup")
	end

	-- Page-level categories.
	if m_data.unsupported_title then
		table.insert(page_cats, "Unsupported titles")
	end
	-- For these two fields, the value stored in the data table is itself the category name.
	for _, field in ipairs({"pagename_defaultsort_conflict", "pagename_displaytitle_conflict"}) do
		local conflict_cat = m_data[field]
		if conflict_cat then
			table.insert(page_cats, conflict_cat)
		end
	end
end


--[==[This is the primary external entry point.
{{lua|full_headword(data)}}
This is used by {{temp|head}} and various language-specific headword templates (e.g. {{temp|ru-adj}} for Russian adjectives, {{temp|de-noun}} for German nouns, etc.) to display an entire headword line.
See [[#Further explanations for full_headword()]]

Returns the formatted headword line (heads, genders, inflections) followed by the formatted language-specific and
whole-page categories. Errors are raised, among other cases, if `data.lang` is not a language object, if
`data.pagename` is not a valid title, if a reconstructed or appendix-constructed language is used in the main
namespace, or if `data.pos_category` is neither given nor inferable from the first category.

The function works on a copy of `data` made with shallowcopy(), so top-level fields are only reassigned on the copy.
NOTE(review): tables nested in `data` (e.g. `data.heads`, `data.categories`, `data.inflections`) are presumably
still shared with the caller and hence modified in place -- confirm against [[Module:table]].
]==]
function export.full_headword(data)
	local remove_links = require("Module:links").remove_links
	local format_categories = require("Module:utilities").format_categories
	
	-- Prevent data from being destructively modified.
	local data = require("Module:table").shallowcopy(data)
	
	------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------

	if data.getCanonicalName then
		error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
	end

	if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
		error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
	end

	if data.id and type(data.id) ~= "string" then
		error("The id in the data table should be a string.")
	end

	------------ 2. Initialize pagename etc. ------------

	local langcode = data.lang:getCode()
	local langname = data.lang:getCanonicalName()

	if data.pagename then -- for testing, doc pages, etc.
		data.title = mw.title.new(data.pagename)
		if not data.title then
			error(("Bad value for `data.pagename`: '%s'"):format(data.pagename))
		end
	else
		data.title = mw.title.getCurrentTitle()
	end

	local pagename = data.title.text
	local namespace = data.title.nsText

	-- Check the namespace against the language type.
	if namespace == "" then
		if data.lang:hasType("reconstructed") then
			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
		elseif data.lang:hasType("appendix-constructed") then
			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
		end
	end

	------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------

	if type(data.heads) == "table" and type(data.heads[1]) == "table" then
		-- new-style
		if data.translits or data.transcriptions then
			error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
		end
	else
		-- convert old-style `heads`, `translits` and `transcriptions` to new-style
		local maxind = math.max(
			init_and_find_maximum_index(data, "heads", true),
			init_and_find_maximum_index(data, "translits", true),
			init_and_find_maximum_index(data, "transcriptions", true)
		)
		for i = 1, maxind do
			data.heads[i] = {
				term = data.heads[i],
				tr = data.translits[i],
				ts = data.transcriptions[i],
			}
		end
	end

	-- Make sure there's at least one head.
	if not data.heads[1] then
		data.heads[1] = {}
	end

	------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------

	init_and_find_maximum_index(data, "categories")
	init_and_find_maximum_index(data, "whole_page_categories")
	local pos_category_already_present = false
	if #data.categories > 0 then
		local escaped_langname = require("Module:pattern utilities").pattern_escape(langname)
		local matches_lang_pattern = "^" .. escaped_langname .. " "
		for _, cat in ipairs(data.categories) do
			-- Does the category begin with the language name? If not, tag it with a tracking category.
			if not cat:find(matches_lang_pattern) then
				-- [[Special:WhatLinksHere/Template:tracking/headword/no lang category]]
				-- [[Special:WhatLinksHere/Template:tracking/headword/no lang category/LANGCODE]]
				track("no lang category", langcode)
			end
		end

		-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
		-- throw an error below.
		if not data.pos_category and data.categories[1]:find(matches_lang_pattern) then
			data.pos_category = data.categories[1]:gsub(matches_lang_pattern, "")
			-- Optimization to avoid inserting category already present.
			pos_category_already_present = true
		end
	end

	if not data.pos_category then
		error("`data.pos_category` not specified and could not be inferred from the categories given in "
			.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
			.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
			.. "language's canonical name plus the plural part of speech (e.g. \"Norwegian Bokmål proper nouns\")."
			)
	end

	-- Insert a category at the beginning for the part of speech unless it's already present or `data.noposcat` given.
	if not pos_category_already_present and not data.noposcat then
		local pos_category = langname .. " " .. data.pos_category
		-- FIXME: [[User:Theknightwho]] Why is this special case here? Please add an explanatory comment.
		if pos_category ~= "Translingual Han characters" then
			table.insert(data.categories, 1, pos_category)
		end
	end

	-- Try to determine whether the part of speech refers to a lemma or a non-lemma form; if we can figure this out,
	-- add an appropriate category.
	local postype = export.pos_lemma_or_nonlemma(data.pos_category)
	if not postype then
		-- We don't know what this category is, so tag it with a tracking category.
		-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos]]
		-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/LANGCODE]]
		track("unrecognized pos", langcode)
		-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/POS]]
		-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/POS/LANGCODE]]
		track("unrecognized pos/pos/" .. data.pos_category, langcode)
	elseif not data.noposcat then
		table.insert(data.categories, 1, langname .. " " .. postype .. "s")
	end

	------------ 5. Create a default headword, and add links to multiword page names. ------------

	-- Determine if term is reconstructed
	local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")

	-- Create a default headword based on the pagename, which is determined in
	-- advance by the data module so that it only needs to be done once.
	local default_head = mw.ustring.lower(m_data.pagename)
	local unmodified_default_head = default_head

	-- Add links to multi-word page names when appropriate
	if not m_data.no_multiword_links[langcode] and not is_reconstructed and export.head_is_multiword(default_head) then
		default_head = export.add_multiword_links(default_head, true)
	end

	if is_reconstructed then
		default_head = "*" .. default_head
	end

	------------ 6. Fill in missing values in `data.heads`. ------------

	-- True if any script among the headword scripts has spaces in it.
	local any_script_has_spaces = false
	-- True if any term has a redundant head= param.
	local has_redundant_head_param = false

	for _, head in ipairs(data.heads) do

		------ 6a. If missing head, replace with default head.
		if not head.term then
			head.term = default_head
		elseif head.term == default_head then
			has_redundant_head_param = true
		end

		------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
		------     otherwise fall back to the overall script if given. If neither given, autodetect the script.

		if not head.sc then
			if data.sc then
				-- Overall script given.
				head.sc = data.sc
			else
				-- Autodetect script.
				head.sc = data.lang:findBestScript(head.term)
			end
		end

		-- If using a discouraged character sequence, add to maintenance category.
		if head.sc:hasNormalizationFixes() == true then
			local composed_head = unfc(head.term)
			if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
				table.insert(data.whole_page_categories, "Pages using discouraged character sequences")
			end
		end

		any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()

		------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
		------     (provided automatic translit is available, e.g. not in Persian or Hebrew).

		-- Make transliterations
		head.tr_manual = nil

		-- Try to generate a transliteration if necessary
		if head.tr == "-" then
			head.tr = nil
		elseif not notranslit[langcode] and head.sc:isTransliterated() then
			head.tr_manual = not not head.tr
			
			local text = head.term
			if not data.lang:link_tr() then
				text = remove_links(text)
			end
			
			local automated_tr, tr_categories
			automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
			
			if automated_tr or head.tr_fail then
				local manual_tr = head.tr

				-- Compare the manual transliteration with the automatic one only if automatic transliteration
				-- did not fail. On failure no category is added either way, and `automated_tr` may be nil, so
				-- there is nothing to pass to remove_links().
				if manual_tr and not head.tr_fail then
					if remove_links(manual_tr) == remove_links(automated_tr) then
						table.insert(data.categories, "Terms with redundant transliterations")
						table.insert(data.categories, "Terms with redundant transliterations/" .. langcode)
					else
						table.insert(data.categories, "Terms with manual transliterations different from the automated ones")
						table.insert(data.categories, "Terms with manual transliterations different from the automated ones/" .. langcode)
					end
				end

				if not manual_tr then
					head.tr = automated_tr
					for _, category in ipairs(tr_categories) do
						table.insert(data.categories, category)
					end
				end
			end

			-- There is still no transliteration?
			-- Add the entry to a cleanup category.
			if not head.tr then
				head.tr = "<small>transliteration needed</small>"
				table.insert(data.categories, "Requests for transliteration of " .. langname .. " terms")
			else
				-- Otherwise, trim it.
				head.tr = mw.text.trim(head.tr)
			end
		end

		-- Link to the transliteration entry for languages that require this.
		if head.tr and data.lang:link_tr() then
			head.tr = require("Module:links").full_link {
				term = head.tr,
				lang = data.lang,
				sc = require("Module:scripts").getByCode("Latn"),
				tr = "-"
			}
		end
	end

	------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

	-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
	-- (FIXME: Don't make assumptions like this, and if you need to do so, throw an error if the assumption is violated.)
	-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the headword, as in [[check]]. The check for ASCII
	-- might need to be expanded to a check for any Latin characters and whitespace or punctuation.
	local display_title
	-- Where there are multiple headwords, use the script for the first. This assumes the first headword is similar to
	-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
	-- about the best we can do (alternatively we could potentially do script detection on the pagename).
	local dt_script = data.heads[1].sc
	local dt_script_code = dt_script:getCode()
	local page_non_ascii = namespace == "" and not pagename:find("^[%z\1-\127]+$")
	local unsupported_pagename, unsupported = pagename:gsub("^Unsupported titles/", "")
	if unsupported == 1 and m_data.unsupported_titles[unsupported_pagename] then
		display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. m_data.unsupported_titles[unsupported_pagename] .. '</span>'
	elseif page_non_ascii and toBeTagged[dt_script_code]
		or (dt_script_code == "Jpan" and (text_in_script(pagename, "Hira") or text_in_script(pagename, "Kana")))
		or (dt_script_code == "Kore" and text_in_script(pagename, "Hang")) then
		display_title = '<span class="' .. dt_script_code .. '">' .. pagename .. '</span>'
	-- Keep Han entries region-neutral in the display title.
	elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
		display_title = '<span class="Hani">' .. pagename .. '</span>'
	elseif namespace == "Reconstruction" then
		-- Tag only the term, i.e. the part after "Reconstruction:LANGNAME/". `matched` is the substitution count
		-- and must be local so that it doesn't leak into the global environment.
		local matched
		display_title, matched = rsubn(
			data.title.fullText,
			"^(Reconstruction:[^/]+/)(.+)$",
			function(before, term)
				return before ..
					require("Module:script utilities").tag_text(
						term,
						data.lang,
						dt_script
					)
			end
		)
		if matched == 0 then
			display_title = nil
		end
	end
	
	if display_title then
		mw.getCurrentFrame():callParserFunction(
			"DISPLAYTITLE",
			display_title
		)
	end

	------------ 8. Insert additional categories. ------------
	
	if data.force_cat_output then
		-- [[Special:WhatLinksHere/Template:tracking/headword/force cat output]]
		track("force cat output")
	end

	if has_redundant_head_param then
		if not data.no_redundant_head_cat then
			table.insert(data.categories, langname .. " terms with redundant head parameter")
		end
	end

	-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
	if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" and not m_data.no_multiword_cat[langcode] then
		-- Check for spaces or hyphens, but exclude prefixes and suffixes.
		-- Use the pagename, not the head= value, because the latter may have extra
		-- junk in it, e.g. superscripted text that throws off the algorithm.
		local checkpattern = ".[%s%-፡]."
		if m_data.hyphen_not_multiword_sep[langcode] then
			-- Exclude hyphens if the data module states that they should for this language
			checkpattern = ".[%s፡]."
		end
		if rfind(unmodified_default_head, checkpattern) and not non_categorizable(data) then
			table.insert(data.categories, langname .. " multiword terms")
		end
	end

	if data.sccat then
		for _, head in ipairs(data.heads) do
			table.insert(data.categories, langname .. " " .. data.pos_category .. " in " .. head.sc:getDisplayForm())
		end
	end

	-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
	local standard = data.lang:getStandardCharacters()
	if standard and not non_categorizable(data) then
		local function char_category(char)
			local specials = {["#"] = "number sign", ["<"] = "less-than sign", [">"] = "greater-than sign", ["["] = "left square bracket", ["]"] = "right square bracket", ["_"] = "underscore", ["{"] = "left curly bracket", ["|"] = "vertical line", ["}"] = "right curly bracket", ["ß"] = "ẞ", ["ͅ"] = "ͅ", ["\239\191\189"] = "replacement character"}
			char = mw.ustring.toNFD(char)
				:gsub("[%z\1-\127\194-\244][\128-\191]*", function(m)
					local new_m = specials[m]
					new_m = new_m or m:uupper()
					return new_m
				end)
			return mw.ustring.toNFC(char)
		end
		if data.lang:getCode() ~= "lo" then
			local standard_chars_scripts = {}
			for _, head in ipairs(data.heads) do
				standard_chars_scripts[head.sc:getCode()] = true
			end
			-- Iterate over the scripts, in case there is more than one (as they can have different sets of standard characters).
			for code in pairs(standard_chars_scripts) do
				local sc_standard = data.lang:getStandardCharacters(code)
				if sc_standard then
					if m_data.pagename_len > 1 then
						local explode_standard = {}
						local function explode(char)
							explode_standard[char] = true
							return ""
						end
						-- Collect the standard characters into `explode_standard`, longest units first (double
						-- combining sequences, then single combining sequences, then individual characters). A
						-- separate variable is used so that `sc_standard` stays intact for the diacritic checks
						-- below.
						local remaining = rsub(sc_standard, m_data.comb_chars.combined_double, explode)
						remaining = rsub(remaining, m_data.comb_chars.combined_single, explode)
							:gsub("[%z\1-\127\194-\244][\128-\191]*", explode)
						-- Set once the numbers category has been added, so that it is added only once however
						-- many different digits the pagename contains.
						local num
						for char in pairs(m_data.explode_pagename) do
							if not explode_standard[char] then
								if char:find("[0-9]") then
									if not num then
										table.insert(data.categories, langname .. " terms spelled with numbers")
										num = true
									end
								else
									local upper = char_category(char)
									if not explode_standard[upper] then
										char = upper
									end
									table.insert(data.categories, langname .. " terms spelled with " .. char)
								end
							end
						end
					end
					-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
					sc_standard = mw.ustring.toNFD(sc_standard)
					for diacritic in rgmatch(m_data.decompose_pagename, m_data.comb_chars.diacritics_single) do
						if not mw.ustring.find(sc_standard, diacritic) then
							table.insert(data.categories, langname .. " terms spelled with ◌" .. diacritic)
						end
					end
					for diacritic in rgmatch(m_data.decompose_pagename, m_data.comb_chars.diacritics_double) do
						if not mw.ustring.find(sc_standard, diacritic) then
							table.insert(data.categories, langname .. " terms spelled with ◌" .. diacritic .. "◌")
						end
					end
				end
			end
		-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
		elseif ulen(m_data.pagename) ~= 1 then
			for character in rgmatch(m_data.pagename, "([^" .. standard .. "])") do
				local upper = char_category(character)
				if not rfind(upper, "[" .. standard .. "]") then
					character = upper
				end
				table.insert(data.categories, langname .. " terms spelled with " .. character)
			end
		end
	end

	-- Categorise for palindromes
	if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(data.title.subpageText) > 2
		-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
		-- multiple scripts?
		and require("Module:palindromes").is_palindrome(data.title.subpageText, data.lang, data.heads[1].sc) then
		table.insert(data.categories, langname .. " palindromes")
	end

	if namespace == "" and not data.lang:hasType("reconstructed") then
		local m_links = require("Module:links")
		for _, head in ipairs(data.heads) do
			if data.title.prefixedText ~= m_links.getLinkPage(remove_links(head.term), data.lang, head.sc) then
				-- [[Special:WhatLinksHere/Template:tracking/headword/pagename spelling mismatch]]
				-- [[Special:WhatLinksHere/Template:tracking/headword/pagename spelling mismatch/LANGCODE]]
				track("pagename spelling mismatch", data.lang:getCode())
				break
			end
		end
	end
	
	-- Add to various maintenance categories.
	export.maintenance_cats(m_data, data.lang, data.categories, data.whole_page_categories)
	
	------------ 9. Format and return headwords, genders, inflections and categories. ------------

	-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
	-- so make sure we do it before evaluating `data.categories`.
	local text =
		format_headword(data) ..
		format_genders(data) ..
		format_inflections(data)
	
	-- Language-specific categories.
	local cats = format_categories(
		data.categories, data.lang, data.sort_key, m_data.encoded_pagename,
		data.force_cat_output or test_force_categories, data.heads[1].sc
	)
	-- Language-agnostic categories.
	local whole_page_cats = format_categories(
		data.whole_page_categories, nil, "-"
	)
	return text .. cats .. whole_page_cats
end

return export