Module:Data consistency check: Difference between revisions
(Added plain Languages/data. Cross your fingers.) |
Djpwikiadmin (talk | contribs) No edit summary |
||
Line 1: | Line 1: | ||
local export = {} | local export = {} | ||
local m_language_data = require("Module:languages/ | local m_language_data = require("Module:languages/data/all") | ||
local m_language_codes = require( | local m_language_codes = require("Module:languages/code to canonical name") | ||
local m_language_canonical_names = require( | local m_language_canonical_names = require("Module:languages/canonical names") | ||
local m_etym_language_data = require("Module:etymology languages/data") | local m_etym_language_data = require("Module:etymology languages/data") | ||
local m_family_data = require( | local m_etym_language_codes = require("Module:etymology languages/code to canonical name") | ||
local m_script_data = require( | local m_etym_language_canonical_names = require("Module:etymology languages/canonical names") | ||
local m_family_data = require("Module:families/data") | |||
local m_family_codes = require("Module:families/code to canonical name") | |||
local m_family_canonical_names = require("Module:families/canonical names") | |||
local m_script_data = require("Module:scripts/data") | |||
local m_languages = require("Module:languages") | |||
local m_links = require("Module:links") | |||
local m_scripts = require("Module:scripts") | |||
local m_script_utils = require("Module:script utilities") | |||
local m_table = require("Module:table") | local m_table = require("Module:table") | ||
local Array = require("Module:array") | local Array = require("Module:array") | ||
Line 14: | Line 22: | ||
-- Record one formatted message against `modname`.
-- The varargs are handed to string.format and the result is inserted into
-- messages[modname]. Both steps run under pcall; when either raises, the
-- error value and the original arguments are written with mw.log.
local function discrepancy(modname, ...)
	local function record(...)
		messages[modname]:insert(string.format(...))
	end

	local succeeded, failure = pcall(record, ...)
	if succeeded then
		return
	end
	mw.log(failure, ...)
end
Line 57: | Line 68: | ||
end) | end) | ||
:concat(", "), | :concat(", "), | ||
(is_script and link_script or link)( | (is_script and link_script or link)(data[1]), | ||
code, | code, | ||
plural and "are" or "is") | plural and "are" or "is") | ||
Line 90: | Line 101: | ||
end | end | ||
-- Report a discrepancy when data[field_name] is anything other than nil or
-- true.
--   modname    - module name the message is filed under (passed to discrepancy)
--   code       - code of the entry being checked, shown in the message
--   data       - the entry's data table; data[1] is shown as its name
--   field_name - key in `data` whose value is validated
local function check_true_or_nil(modname, code, data, field_name)
	local value = data[field_name]
	if value == nil or value == true then
		return
	end
	discrepancy(modname,
		"%s (<code>%s</code>) has an <code>%s</code> value that is not <code>nil</code> or <code>true</code>: %s",
		-- Take the name from the data table: this function has no
		-- canonical_name variable of its own.
		link(data[1]), code, field_name,
		-- Print the field that was actually checked, whichever one it is.
		tostring(value)
	)
end
-- Validate that a field holds a gap-free array.
-- The field is looked up as data[array_name], or as
-- data[subarray_name][array_name] when subarray_name is given (in which case
-- data[subarray_name] must exist, otherwise assert raises with the subarray
-- name as the message).
-- Returns true when the field is a table and find_gap reports no gap.
-- Otherwise a discrepancy is recorded and nothing is returned.
local function check_array(modname, code, data, array_name, subarray_name)
	-- Phrase naming the enclosing sub-table, built only when a message needs it.
	local function location()
		return subarray_name and "the " .. subarray_name .. " field in " or ""
	end

	local container = data
	if subarray_name then
		container = assert(data[subarray_name], subarray_name)
	end

	local kind = type(container[array_name])
	if kind ~= "table" then
		discrepancy(modname, "The %s field in %sthe data table for %s (<code>%s</code>) should be an array (table) but is %s.",
			array_name,
			location(),
			data[1],
			code,
			kind == "nil" and "nil" or "a " .. kind)
		return
	end

	local gap = find_gap(container[array_name])
	if not gap then
		return true
	end
	discrepancy(modname, "The %s array in %sthe data table for %s (<code>%s</code>) has a gap at index %d.",
		array_name,
		location(),
		data[1],
		code, gap)
end
-- Validate the Wikidata item id stored at data[key].
-- Accepted values: nil (no item), a number for which Module:table's
-- isPositiveInteger returns true, or a string matching "^Q%d+$".
-- Anything else is reported through discrepancy(), including values of other
-- types (boolean, table, ...), which would otherwise pass unnoticed.
--   modname - module name the message is filed under
--   code    - code of the entry being checked, shown in the message
--   data    - the entry's data table; data[1] is shown as its name
--   key     - index in `data` holding the Wikidata item id
local function check_wikidata_item(modname, code, data, key)
	local data_item = data[key]
	if data_item == nil then
		return
	end

	local item_type = type(data_item)
	if item_type == "number" then
		if not require("Module:table").isPositiveInteger(data_item) then
			discrepancy(modname, "%g, the Wikidata item id for %s (<code>%s</code>), is not a positive integer or a string in the correct format.",
				data_item, data[1], code)
		end
	elseif item_type == "string" then
		if not data_item:find "^Q%d+$" then
			discrepancy(modname, "%s, the Wikidata item id for %s (<code>%s</code>), is not a string in the correct format or a positive integer.",
				data_item, data[1], code)
		end
	else
		-- Neither a number nor a string: name the type instead of trying to
		-- format the value itself.
		discrepancy(modname, "The Wikidata item id for %s (<code>%s</code>) is a %s; it must be a positive integer or a string in the correct format.",
			data[1], code, item_type)
	end
end
Line 153: | Line 210: | ||
if type(pattern) ~= "string" then | if type(pattern) ~= "string" then | ||
discrepancy(modname, '"%s", the %spattern for %s (<code>%s</code>), is not a string.', | discrepancy(modname, '"%s", the %spattern for %s (<code>%s</code>), is not a string.', | ||
pattern, standardChars and 'standard character ' or '', code, data | pattern, standardChars and 'standard character ' or '', code, data[1]) | ||
end | end | ||
local ranges | local ranges | ||
for lower, higher in mw.ustring.gmatch(pattern, "(.)%-(.)") do | for lower, higher in mw.ustring.gmatch(pattern, "(.)%-%%?(.)") do | ||
if get_codepoint(lower) >= get_codepoint(higher) then | if get_codepoint(lower) >= get_codepoint(higher) then | ||
ranges = ranges or Array() | ranges = ranges or Array() | ||
Line 167: | Line 224: | ||
'for %scharacter detection: <code>"%s"</code>. The first codepoint%s ' .. | 'for %scharacter detection: <code>"%s"</code>. The first codepoint%s ' .. | ||
'in the range%s %s %s must be less than the second.', | 'in the range%s %s %s must be less than the second.', | ||
link(data | link(data[1]), code, standardChars and 'standard ' or '', pattern, plural, plural, | ||
ranges | ranges | ||
:map( | :map( | ||
Line 183: | Line 240: | ||
end | end | ||
end | end | ||
local remove_exceptions_addition = 0xF0000 | |||
local maximum_code_point = 0x10FFFF | |||
local remove_exceptions_maximum_code_point = maximum_code_point - remove_exceptions_addition | |||
local function check_entry_name_or_sortkey(modname, code, data, replacements_name) | local function check_entry_name_or_sortkey(modname, code, data, replacements_name) | ||
local canonical_name = data[1] | |||
local replacements = data[replacements_name] | local replacements = data[replacements_name] | ||
if type(replacements) == "string" then | if type(replacements) == "string" then | ||
if replacements_name | if not (replacements_name == "sort_key" or replacements_name == "entry_name") then | ||
discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be a table.", | discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be a table.", | ||
replacements_name, | replacements_name, canonical_name, code) | ||
end | end | ||
return | return | ||
Line 197: | Line 260: | ||
discrepancy(modname, | discrepancy(modname, | ||
"The <code>from</code> and <code>to</code> arrays in the <code>%s</code> table for %s (<code>%s</code>) are not both defined or both undefined.", | "The <code>from</code> and <code>to</code> arrays in the <code>%s</code> table for %s (<code>%s</code>) are not both defined or both undefined.", | ||
replacements_name, | replacements_name, canonical_name, code) | ||
elseif replacements.from then | elseif replacements.from then | ||
for _, key in ipairs { "from", "to" } do | for _, key in ipairs { "from", "to" } do | ||
check_array(modname, code, data, key, replacements_name) | |||
end | end | ||
end | end | ||
Line 212: | Line 270: | ||
discrepancy(modname, | discrepancy(modname, | ||
"The <code>remove_diacritics</code> field in the <code>%s</code> table for %s (<code>%s</code>) table must be a string.", | "The <code>remove_diacritics</code> field in the <code>%s</code> table for %s (<code>%s</code>) table must be a string.", | ||
replacements_name, data. | replacements_name, canonical_name, code) | ||
end | |||
if replacements.remove_exceptions then | |||
if check_array(modname, code, data, "remove_exceptions", replacements_name) then | |||
for sequence_i, sequence in ipairs(replacements.remove_exceptions) do | |||
local code_point_i = 0 | |||
for code_point in mw.ustring.gcodepoint(sequence) do | |||
code_point_i = code_point_i + 1 | |||
if code_point > remove_exceptions_maximum_code_point then | |||
discrepancy(modname, | |||
"Code point #%d (0x%04X) in field #%d of the <code>remove_exceptions</code> array for %s (<code>%s</code>) is over U+%04X.", | |||
code_point_i, code_point, sequence_i, canonical_name, code, remove_exceptions_maximum_code_point) | |||
end | |||
end | |||
end | |||
end | |||
end | end | ||
Line 219: | Line 294: | ||
discrepancy(modname, | discrepancy(modname, | ||
"The <code>from</code> array in the <code>%s</code> table for %s (<code>%s</code>) must be shorter or the same length as the <code>to</code> array.", | "The <code>from</code> array in the <code>%s</code> table for %s (<code>%s</code>) must be shorter or the same length as the <code>to</code> array.", | ||
replacements_name, | replacements_name, canonical_name, code) | ||
end | end | ||
end | end | ||
Line 238: | Line 313: | ||
-- Check every ancestor code listed for an entry.
-- `ancestors` may be a comma-separated string (split with mw.text.split) or
-- an array of codes. A discrepancy is recorded for each code found in neither
-- m_language_data nor m_etym_language_data.
-- `is_etymology_language` is accepted but not read by this function.
local function check_ancestors(modname, code, data, ancestors, is_etymology_language)
	if type(ancestors) == "string" then
		ancestors = mw.text.split(ancestors, "%s*,%s*")
	end

	local canonical_name = data[1]
	for _, ancestor_code in ipairs(ancestors) do
		local known = m_language_data[ancestor_code] or m_etym_language_data[ancestor_code]
		if not known then
			discrepancy(modname,
				"%s (<code>%s</code>) lists an invalid language code <code>%s</code> as ancestor.",
				link(canonical_name), code, ancestor_code)
		end
	end
end
-- Cross-check a "code to canonical name" module and a "canonical name to
-- code" module against the data that was actually collected.
--   source_module_description - text naming where the real data lives, used
--                               in messages
--   code_to_module_map        - collected map: code -> name of the data module
--                               (without the "Module:" prefix) that defines it
--   name_to_code_map          - collected map: canonical name -> code
--   code_to_name_modname / code_to_name_module - name and contents of the
--                               code -> name module being checked
--   name_to_code_modname / name_to_code_module - name and contents of the
--                               name -> code module being checked
local function check_code_to_name_and_name_to_code_maps(
		source_module_description,
		code_to_module_map, name_to_code_map,
		code_to_name_modname, code_to_name_module,
		name_to_code_modname, name_to_code_module)

	-- Compare one (code, canonical name) pair with the collected maps and
	-- file any mismatch under `modname`.
	local function verify_pair(modname, code, canonical_name)
		local source_module = code_to_module_map[code]
		local expected_code = name_to_code_map[canonical_name]

		if source_module and expected_code then
			return
		end

		if source_module then
			-- The code exists but the name does not: fetch the real name
			-- from the data module that defines the code.
			local entry = require("Module:" .. source_module)[code]
			discrepancy(modname,
				"%s, the canonical name for the code <code>%s</code>, is wrong; it should be %s.",
				canonical_name, code, entry[1])
		elseif expected_code then
			discrepancy(modname,
				"<code>%s</code>, the code for the canonical name %s, is wrong; it should be <code>%s</code>.",
				code, canonical_name, expected_code)
		else
			discrepancy(modname,
				"The code <code>%s</code> and the canonical name %s should be removed; they are not found in %s.",
				code, canonical_name, source_module_description)
		end
	end

	for code, canonical_name in pairs(code_to_name_module) do
		verify_pair(code_to_name_modname, code, canonical_name)
	end

	for canonical_name, code in pairs(name_to_code_module) do
		verify_pair(name_to_code_modname, code, canonical_name)
	end
end
-- Report every code present in an extra-data module that has no entry in the
-- corresponding main data module. Messages are filed under the extra-data
-- module's name.
local function check_extraneous_extra_data(
		data_modname, data_module, extra_data_modname, extra_data_module)
	local message =
		"Language code <code>%s</code> is not found in [[Module:%s]], and should be removed from [[Module:%s]]."

	for code in pairs(extra_data_module) do
		local main_entry = data_module[code]
		if not main_entry then
			discrepancy(extra_data_modname, message, code, data_modname, extra_data_modname)
		end
	end
end
-- Render whatever sits in a family-code slot for display in a message.
-- A string is wrapped in <code> tags. Any other value is handed to
-- Module:debug's highlight_dump rather than concatenated, so a misplaced
-- value that cannot be concatenated does not cause a module error here.
local function show_family_code(code)
	if type(code) ~= "string" then
		return require("Module:debug").highlight_dump(code)
	end
	return "<code>" .. code .. "</code>"
end
Line 261: | Line 382: | ||
local function check_languages() | local function check_languages() | ||
local check_language_data_keys = check_data_keys{ | local check_language_data_keys = check_data_keys{ | ||
1, 2, 3, -- canonical name, wikidata item, family | 1, 2, 3, 4, -- canonical name, wikidata item, family, scripts | ||
"entry_name", "sort_key", "otherNames", "aliases", "varieties", | "display_text", "generate_forms", "entry_name", "sort_key", | ||
"type | "otherNames", "aliases", "varieties", | ||
"type", "ancestors", | |||
"wikimedia_codes", "wikipedia_article", "standardChars", | "wikimedia_codes", "wikipedia_article", "standardChars", | ||
" | "translit", "override_translit", "link_tr", | ||
"dotted_dotless_i" | |||
} | } | ||
local function check_language(modname, code, data) | local function check_language(modname, code, data, mainData, extraData) | ||
local canonical_name | local canonical_name, lang_type = data[1], data.type | ||
check_language_data_keys(modname, code, data) | check_language_data_keys(modname, code, data) | ||
Line 295: | Line 418: | ||
end | end | ||
if | check_wikidata_item(modname, code, data, 2) | ||
if extraData then | |||
check_other_names_aliases_varieties(modname, code, canonical_name, extraData) | |||
end | end | ||
if lang_type and not (lang_type == "regular" or lang_type == "reconstructed" or lang_type == "appendix-constructed") then | if lang_type and not (lang_type == "regular" or lang_type == "reconstructed" or lang_type == "appendix-constructed") then | ||
Line 309: | Line 428: | ||
end | end | ||
if | if mainData.aliases then | ||
discrepancy(modname, "%s (<code>%s</code>) has the <code>aliases</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(canonical_name), code) | |||
if not data. | end | ||
if mainData.varieties then | |||
discrepancy(modname, "%s (<code>%s</code>) has the <code>varieties</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(canonical_name), code) | |||
end | |||
if mainData.otherNames then | |||
discrepancy(modname, "%s (<code>%s</code>) has the <code>otherNames</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(canonical_name), code) | |||
end | |||
if not extraData then | |||
discrepancy(modname .. "/extra", "%s (<code>%s</code>) has data in [[Module:" .. modname .. "]], but does not have corresponding data in [[Module:" .. modname .. "/extra]].", link(canonical_name), code) | |||
--elseif extraData.otherNames then | |||
-- discrepancy(modname .. "/extra", "%s (<code>%s</code>) has <code>otherNames</code> key, but these should be changed to either <code>aliases</code> or <code>varieties</code>.", link(canonical_name), code) | |||
end | |||
local sc = data[4] | |||
if sc then | |||
if type(sc) == "string" then | |||
sc = mw.text.split(sc, "%s*,%s*") | |||
end | |||
if type(sc) == "table" then | |||
if not sc[1] then | |||
discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(canonical_name), code) | |||
else | |||
for _, sccode in ipairs(sc) do | |||
local cur_sc = m_script_data[sccode] | |||
if not cur_sc and sccode ~= "All" then | |||
discrepancy(modname, | |||
"%s (<code>%s</code>) lists an invalid script code <code>%s</code>.", | |||
link(canonical_name), code, sccode) | |||
-- elseif not cur_sc.characters then | |||
-- discrepancy(modname, | |||
-- "%s (<code>%s</code>) lists a script without characters <code>%s</code> (%s).", | |||
-- link(canonical_name), code, sccode, cur_sc[1]) | |||
end | |||
nonempty_scripts[sccode] = true | |||
end | end | ||
end | end | ||
else | |||
discrepancy(modname, | |||
"The %s field for %s (<code>%s</code>) must be a table or string.", | |||
4, link(canonical_name), code) | |||
end | end | ||
end | end | ||
Line 334: | Line 485: | ||
if not m_family_data[family] then | if not m_family_data[family] then | ||
discrepancy(modname, | discrepancy(modname, | ||
"%s (<code>%s</code>) has an invalid family code | "%s (<code>%s</code>) has an invalid family code %s.", | ||
link(canonical_name), code, family) | link(canonical_name), code, show_family_code(family)) | ||
end | end | ||
Line 347: | Line 498: | ||
if data.entry_name then | if data.entry_name then | ||
check_entry_name_or_sortkey(modname, code, data, "entry_name") | check_entry_name_or_sortkey(modname, code, data, "entry_name") | ||
end | |||
if data.display then | |||
check_entry_name_or_sortkey(modname, code, data, "display") | |||
end | end | ||
if data.standardChars then | if data.standardChars then | ||
if type(data.standardChars) == "table" then | |||
local sccodes = {} | |||
for _, sccode in ipairs(sc) do | |||
sccodes[sccode] = true | |||
end | |||
for sccode in pairs(data.standardChars) do | |||
if not (sccodes[sccode] or sccode == 1) then | |||
discrepancy(modname, "The field %s in the standardChars table for %s (<code>%s</code>) does not match any script for that language.", | |||
sccode, canonical_name, code) | |||
end | |||
end | |||
elseif data.standardChars and type(data.standardChars) ~= "string" then | |||
discrepancy(modname, "The standardChars field in the data table for %s (<code>%s</code>) must be a string or table.", | |||
canonical_name, code) | |||
end | |||
end | |||
check_true_or_nil(modname, code, data, "override_translit") | |||
check_true_or_nil(modname, code, data, "link_tr") | |||
if data.override_translit and not data.translit then | |||
discrepancy(modname, | |||
"%s (<code>%s</code>) has <code>override_translit</code> set, but no transliteration module", | |||
link(canonical_name), code) | |||
end | end | ||
end | end | ||
-- Check two-letter codes | -- Check two-letter codes | ||
local modname = "languages/ | local modname = "languages/data/2" | ||
local data2 = | local data2 = mw.loadData("Module:" .. modname) | ||
local extra_modname = modname .. "/extra" | |||
local extradata2 = mw.loadData("Module:" .. extra_modname) | |||
for code, data in pairs(data2) do | for code, data in pairs(data2) do | ||
if not code:find("^[a-z][a-z]$") then | if not code:find("^[a-z][a-z]$") then | ||
discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.', link(data | discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.', link(data[1]), code) | ||
end | end | ||
check_language(modname, code, data) | check_language(modname, code, data, data2[code], extradata2[code]) | ||
end | end | ||
check_extraneous_extra_data(modname, data2, extra_modname, extradata2) | |||
-- Check three-letter codes | -- Check three-letter codes | ||
for i = string.byte('a'), string.byte('z') do | for i = string.byte('a'), string.byte('z') do | ||
local letter = string.char(i) | local letter = string.char(i) | ||
local modname = "languages/ | local modname = "languages/data/3/" .. letter | ||
local data3 = | local data3 = mw.loadData("Module:" .. modname) | ||
local extra_modname = modname .. "/extra" | |||
local extradata3 = mw.loadData("Module:" .. extra_modname) | |||
local code_pattern = "^" .. letter .. "[a-z][a-z]$" | local code_pattern = "^" .. letter .. "[a-z][a-z]$" | ||
Line 377: | Line 560: | ||
discrepancy(modname, | discrepancy(modname, | ||
'%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".', | '%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".', | ||
link(data | link(data[1]), code, letter) | ||
end | end | ||
check_language(modname, code, data | check_language(modname, code, data, data3[code], extradata3[code]) | ||
end | end | ||
check_extraneous_extra_data(modname, data3, extra_modname, extradata3) | |||
end | end | ||
-- Check exceptional codes | -- Check exceptional codes | ||
modname = "languages/ | modname = "languages/data/exceptional" | ||
local datax = | local datax = mw.loadData("Module:" .. modname) | ||
extra_modname = modname .. "/extra" | |||
local extradatax = mw.loadData("Module:" .. extra_modname) | |||
for code, data in pairs(datax) do | for code, data in pairs(datax) do | ||
if code:find("^[a-z][a-z][a-z]?$") then | if code:find("^[a-z][a-z][a-z]?$") then | ||
discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.', link(data | discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.', link(data[1]), code) | ||
end | end | ||
check_language(modname, code, data) | check_language(modname, code, data, datax[code], extradatax[code]) | ||
end | end | ||
check_extraneous_extra_data(modname, datax, extra_modname, extradatax) | |||
-- These checks must be done while all_codes only contains language codes: | -- These checks must be done while all_codes only contains language codes: | ||
-- that is, after language data modules have been processed, but before | -- that is, after language data modules have been processed, but before | ||
-- etymology languages, families, and scripts have. | -- etymology languages, families, and scripts have. | ||
local | check_code_to_name_and_name_to_code_maps( | ||
if not | "a submodule of [[Module:languages]]", | ||
all_codes, language_names, | |||
"languages/code to canonical name", m_language_codes, | |||
" | "languages/canonical names", m_language_canonical_names) | ||
-- Check [[Template:langname-lite]] | |||
discrepancy( | local frame = mw.getCurrentFrame() | ||
local content = mw.title.new("Template:langname-lite"):getContent() | |||
content = content:gsub("%<%!%-%-.-%-%-%>", "") -- remove comments | |||
local match = mw.ustring.gmatch(content, "\n\t*|#*([^\n]+)=([^\n]*)") | |||
while true do | |||
local code, name, check_name = match() | |||
if not code then return "OK" end | |||
if code:len() > 1 and code ~= "default" then | |||
for _, code in pairs(mw.text.split(code, "|")) do | |||
local lang = m_languages.getByCode(code, nil, true, true) | |||
if name:match("etymcode") then | |||
local nonEtym_name = frame:preprocess(name) | |||
local nonEtym_real_name = lang:getNonEtymologicalName() | |||
if nonEtym_name ~= nonEtym_real_name then | |||
discrepancy("Template:langname-lite", "Code: <code>" .. code .. "</code>. Saw name: " .. nonEtym_name .. ". Expected name: " .. nonEtym_real_name .. ".") | |||
end | |||
name = frame:preprocess(name:gsub("{{{allow etym|}}}", "1")) | |||
elseif name:match("familycode") then | |||
name = name:match("familycode|(.-)|") | |||
else | |||
name = name | |||
end | |||
if not lang then | |||
discrepancy("Template:langname-lite", "Code: <code>" .. code .. "</code>. Saw name: " .. name .. ". Language not present in data.") | |||
else | |||
local real_name = lang:getCanonicalName() | |||
if name ~= real_name then | |||
discrepancy("Template:langname-lite", "Code: <code>" .. code .. "</code>. Saw name: " .. name .. ". Expected name: " .. real_name .. ".") | |||
end | |||
end | |||
end | end | ||
end | end | ||
end | end | ||
end | end | ||
Line 443: | Line 632: | ||
local check_etymology_language_data_keys = check_data_keys{ | local check_etymology_language_data_keys = check_data_keys{ | ||
" | 1, 2, 3, 4, 5, -- canonical name, wikidata item, family, scripts, parent | ||
"wikipedia_article", " | "display_text", "generate_forms", "entry_name", "sort_key", | ||
"otherNames", "aliases", "varieties", | |||
"type", "main_code", "ancestors", | |||
"wikimedia_codes", "wikipedia_article", "standardChars", | |||
"translit", "override_translit", "link_tr", | |||
"dotted_dotless_i" | |||
} | } | ||
Line 458: | Line 652: | ||
for code, data in pairs(m_etym_language_data) do | for code, data in pairs(m_etym_language_data) do | ||
local canonical_name, parent | local canonical_name, parent = | ||
data | data[1], data[5] | ||
check_etymology_language_data_keys(modname, code, data) | check_etymology_language_data_keys(modname, code, data) | ||
Line 465: | Line 659: | ||
discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code]) | discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code]) | ||
else | else | ||
if not m_etym_language_codes[code] then | |||
discrepancy("etymology languages/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name) | |||
end | |||
all_codes[code] = modname | all_codes[code] = modname | ||
end | end | ||
Line 471: | Line 668: | ||
discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code) | discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code) | ||
elseif language_names[canonical_name] then | elseif language_names[canonical_name] then | ||
local m_canonical_lang = m_languages.getByCanonicalName(canonical_name, nil, true) | |||
discrepancy(modname, | if not m_canonical_lang then | ||
"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.", | discrepancy(modname, "%s (<code>%s</code>) has a canonical name that cannot be looked up.", | ||
link(canonical_name), code) | |||
elseif data.main_code ~= m_canonical_lang:getCode() then | |||
discrepancy(modname, | |||
"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.", | |||
link(canonical_name), code, language_names[canonical_name]) | |||
end | |||
else | else | ||
if not m_etym_language_canonical_names[canonical_name] then | |||
discrepancy("etymology languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code) | |||
end | |||
language_names[canonical_name] = code | language_names[canonical_name] = code | ||
end | end | ||
Line 492: | Line 696: | ||
link(canonical_name), code, parent) | link(canonical_name), code, parent) | ||
end | end | ||
nonempty_families[parent] = true | nonempty_families[parent] = true | ||
else | else | ||
Line 500: | Line 703: | ||
end | end | ||
if ancestors then | if data.ancestors then | ||
check_ancestors(modname, code, data, ancestors, true | check_ancestors(modname, code, data, data.ancestors, false) | ||
end | |||
if data[3] then | |||
local family = data[3] | |||
if not m_family_data[family] then | |||
discrepancy(modname, | |||
"%s (<code>%s</code>) has an invalid family code %s.", | |||
link(canonical_name), code, show_family_code(family)) | |||
end | |||
nonempty_families[family] = true | |||
end | end | ||
check_wikidata_item(modname, code, data, 2) | |||
end | end | ||
Line 515: | Line 730: | ||
if stack[data] then | if stack[data] then | ||
discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)", | discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)", | ||
link(data[1] | link(data[1]), code, | ||
link(m_etym_language_data[data | link(m_etym_language_data[data[5]][1]), data.parent or data[5] | ||
) | ) | ||
break | break | ||
end | end | ||
stack[data] = true | stack[data] = true | ||
code, data = data | code, data = data[5], data[5] and m_etym_language_data[data[5]] | ||
end | end | ||
Line 528: | Line 743: | ||
end | end | ||
end | end | ||
check_code_to_name_and_name_to_code_maps( | |||
"[[Module:etymology languages/data]]", | |||
all_codes, language_names, | |||
"etymology languages/code to canonical name", m_etym_language_codes, | |||
"etymology languages/canonical names", m_etym_language_canonical_names) | |||
end | end | ||
Line 534: | Line 755: | ||
local check_family_data_keys = check_data_keys{ | local check_family_data_keys = check_data_keys{ | ||
1, 2, 3, -- canonical name, wikidata item, (parent) family | |||
"type", | |||
"protoLanguage", "otherNames", "aliases", "varieties", | |||
} | } | ||
Line 550: | Line 772: | ||
for code, data in pairs(m_family_data) do | for code, data in pairs(m_family_data) do | ||
check_family_data_keys(modname, code, data) | check_family_data_keys(modname, code, data) | ||
local canonical_name, family = data[1], data[3] | |||
if all_codes[code] then | if all_codes[code] then | ||
discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code]) | discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code]) | ||
else | else | ||
if not m_family_codes[code] then | |||
discrepancy("families/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name) | |||
end | |||
all_codes[code] = modname | all_codes[code] = modname | ||
end | end | ||
if not | if not canonical_name then | ||
discrepancy(modname, "<code>%s</code> has no canonical name specified.", code) | discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code) | ||
elseif family_names[ | elseif family_names[canonical_name] then | ||
discrepancy(modname, | discrepancy(modname, | ||
"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.", | "%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.", | ||
link( | link(canonical_name), code, family_names[canonical_name]) | ||
else | else | ||
family_names[data | if not m_family_canonical_names[canonical_name] then | ||
discrepancy("families/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code) | |||
end | |||
family_names[canonical_name] = code | |||
end | |||
if data[2] and type(data[2]) ~= "number" then | |||
discrepancy(modname, "%s (<code>%s</code>) has a wikidata item value that is not a number or <code>nil</code>: %s", link(canonical_name), code, mw.dumpObject(data[2])) | |||
end | end | ||
check_other_names_aliases_varieties(modname, code, | check_other_names_aliases_varieties(modname, code, canonical_name, data) | ||
if | if family then | ||
if | if family == code and code ~= "qfa-not" then | ||
discrepancy(modname, | discrepancy(modname, | ||
"%s (<code>%s</code>) has itself as its family.", | "%s (<code>%s</code>) has itself as its family.", | ||
link( | link(canonical_name), code) | ||
elseif not m_family_data[ | elseif not m_family_data[family] then | ||
discrepancy(modname, | discrepancy(modname, | ||
"%s (<code>%s</code>) has an invalid parent family code | "%s (<code>%s</code>) has an invalid parent family code %s.", | ||
link( | link(canonical_name), code, show_family_code(family)) | ||
end | end | ||
nonempty_families[ | nonempty_families[family] = true | ||
end | end | ||
check_wikidata_item(modname, code, data, 2) | |||
end | end | ||
for code, data in pairs(m_family_data) do | for code, data in pairs(m_family_data) do | ||
if not (nonempty_families[code] or allowed_empty_families[code]) then | if not (nonempty_families[code] or allowed_empty_families[code]) then | ||
discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data | discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data[1]), code) | ||
end | end | ||
end | end | ||
Line 600: | Line 836: | ||
if stack[code] then | if stack[code] then | ||
discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)", | discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)", | ||
link(data[1] | link(data[1]), code, | ||
link(m_family_data[data[3]] | link(m_family_data[data[3]][1]), data[3] | ||
) | ) | ||
break | break | ||
end | end | ||
stack[code] = true | stack[code] = true | ||
code, data = data | code, data = data[3], m_family_data[data[3]] | ||
end | end | ||
Line 613: | Line 849: | ||
end | end | ||
end | end | ||
check_code_to_name_and_name_to_code_maps( | |||
"[[Module:families/data]]", | |||
all_codes, family_names, | |||
"families/code to canonical name", m_family_codes, | |||
"families/canonical names", m_family_canonical_names) | |||
end | end | ||
Line 619: | Line 861: | ||
local check_script_data_keys = check_data_keys({ | local check_script_data_keys = check_data_keys({ | ||
1, 2, -- canonical name, writing systems | |||
"canonicalName", "otherNames", "aliases", "varieties", "parent", | "canonicalName", "otherNames", "aliases", "varieties", "parent", | ||
" | "wikipedia_article", "characters", "spaces", "capitalized", "translit", "direction", | ||
"character_category", | "character_category", "normalizationFixes" | ||
}, true) | }, true) | ||
local m_script_codes = require('Module:scripts/code to canonical name') | local m_script_codes = require('Module:scripts/code to canonical name') | ||
local m_script_canonical_names = require('Module:scripts/by name') | local m_script_canonical_names = require('Module:scripts/by name') | ||
-- Just to satisfy requirements of check_code_to_name_and_name_to_code_maps. | |||
local script_code_to_module_map = {} | |||
for code, data in pairs(m_script_data) do | for code, data in pairs(m_script_data) do | ||
local canonical_name = data | local canonical_name = data[1] | ||
if not m_script_codes[code] and #code == 4 then | if not m_script_codes[code] and #code == 4 then | ||
discrepancy('scripts/code to canonical name', '<code>%s</code> (%s) is missing', code, canonical_name) | discrepancy('scripts/code to canonical name', '<code>%s</code> (%s) is missing', code, canonical_name) | ||
Line 666: | Line 912: | ||
validate_pattern(data.characters, modname, code, data, false) | validate_pattern(data.characters, modname, code, data, false) | ||
end | end | ||
script_code_to_module_map[code] = modname | |||
end | |||
check_code_to_name_and_name_to_code_maps( | |||
"a submodule of [[Module:scripts]]", | |||
script_code_to_module_map, script_names, | |||
"scripts/code to canonical name", m_script_codes, | |||
"scripts/by name", m_script_canonical_names) | |||
end | |||
-- Check the label data modules.
-- Every label entry must be either a string or a table; tables are passed to
-- a key checker built by check_data_keys from the list below, and any other
-- type is reported through discrepancy().
-- The three general modules (Module:labels/data, .../regional, .../topical)
-- are loaded with a plain require. For each code in m_language_codes, a
-- module named Module:labels/data/lang/<code> is loaded under pcall and is
-- skipped when loading fails.
local function check_labels()
	local allowed_keys = {
		"display", "Wikipedia", "glossary",
		"plain_categories", "topical_categories", "pos_categories", "regional_categories", "sense_categories",
		"omit_preComma", "omit_postComma", "omit_preSpace",
		"deprecated", "track"
	}
	local check_label_data_keys = check_data_keys(allowed_keys)

	-- Validate every label of one already-loaded module.
	local function check_module(modname, labels)
		for label, data in pairs(labels) do
			local kind = type(data)
			if kind == "table" then
				check_label_data_keys(modname, label, data)
			elseif kind ~= "string" then
				discrepancy(modname,
					"The data for label <code>%s</code> is a %s; only tables and strings are allowed.",
					label, kind)
			end
		end
	end

	for _, suffix in ipairs{"", "/regional", "/topical"} do
		local modname = "Module:labels/data" .. suffix
		check_module(modname, require(modname))
	end

	for code in pairs(m_language_codes) do
		local modname = "Module:labels/data/lang/" .. code
		local loaded, labels = pcall(require, modname)
		if loaded then
			check_module(modname, labels)
		end
	end
end
local function check_zh_trad_simp() | |||
local m_ts = mw.loadData("Module:zh/data/ts") | |||
local m_st = mw.loadData("Module:zh/data/st") | |||
local ruby = require("Module:ja-ruby").ruby_auto | |||
local lang = m_languages.getByCode("zh") | |||
local Hant = m_scripts.getByCode("Hant") | |||
local Hans = m_scripts.getByCode("Hans") | |||
local data = {[0] = m_st.st, m_ts.ts} | |||
local mod = {[0] = "st", "ts"} | |||
local var = {[0] = "Simp.", "Trad."} | |||
local sc = {[0] = Hans, Hant} | |||
local function find_stable_loop(chars, other, j) | |||
local display = ruby({["markup"] = "[" .. other .. "](" .. var[(j+1)%2] .. ")"}) | |||
display = m_links.language_link({term = other, alt = display, lang = lang, sc = sc[(j+1)%2], tr = "-"}, false) | |||
table.insert(chars, display) | |||
if data[(j+1)%2][other] == other then | |||
table.insert(chars, other) | |||
return chars, 1 | |||
elseif not data[(j+1)%2][other] then | |||
table.insert(chars, "not found") | |||
return chars, 2 | |||
elseif data[j%2][data[(j+1)%2][other]] ~= other then | |||
return find_stable_loop(chars, data[(j+1)%2][other], j + 1) | |||
else | |||
local display = ruby({["markup"] = "[" .. data[(j+1)%2][other] .. "](" .. var[j%2] .. ")"}) | |||
display = m_links.language_link({term = data[(j+1)%2][other], alt = display, lang = lang, sc = sc[j%2], tr = "-"}, false) | |||
table.insert(chars, display .. " (") | |||
display = ruby({["markup"] = "[" .. data[j%2][data[(j+1)%2][other]] .. "](" .. var[(j+1)%2] .. ")"}) | |||
display = m_links.language_link({term = data[j%2][data[(j+1)%2][other]], alt = display, lang = lang, sc = sc[(j+1)%2], tr = "-"}, false) | |||
table.insert(chars, display .. " etc.)") | |||
return chars, 3 | |||
end | |||
return chars, issue | |||
end | |||
for i = 0, 1, 1 do | |||
for char, other in pairs(data[i]) do | |||
if data[(i+1)%2][other] ~= char then | |||
local chars, issue = {} | |||
local display = ruby({["markup"] = "[" .. char .. "](" .. var[i] .. ")"}) | |||
display = m_links.language_link({term = char, alt = display, lang = lang, sc = sc[i], tr = "-"}, false) | |||
table.insert(chars, display) | |||
chars, issue = find_stable_loop(chars, other, i) | |||
if issue == 1 or issue == 2 then | |||
local sc_this, mod_this, j = {} | |||
if chars[#chars-1]:match(var[(i+1)%2]) then | |||
j = 1 | |||
else | |||
j = 0 | |||
end | |||
mod_this = mod[(i+j)%2] | |||
sc_this = {[0] = sc[(i+j)%2], sc[(i+j+1)%2]} | |||
for k, char in ipairs(chars) do | |||
chars[k] = m_script_utils.tag_text(char, lang, sc_this[k%2], "term") | |||
end | |||
if issue == 1 then | |||
discrepancy("zh/data/" .. mod_this, "character references itself: " .. table.concat(chars, " → ")) | |||
elseif issue == 2 then | |||
discrepancy("zh/data/" .. mod_this, "missing character: " .. table.concat(chars, " → ")) | |||
end | |||
elseif issue == 3 then | |||
for j, char in ipairs(chars) do | |||
chars[j] = m_script_utils.tag_text(char, lang, sc[(i+j)%2], "term") | |||
end | |||
discrepancy("zh/data/" .. mod[i], "possible mismatched character: " .. table.concat(chars, " → ")) | |||
end | |||
end | |||
end | |||
end | |||
end | |||
local function check_serialization(modname) | |||
local serializers = { | |||
["Hani-sortkey/data/serialized"] = "Hani-sortkey/serializer", | |||
["zh/data/ts/serialized"] = "zh/data/ts/serializer", | |||
["zh/data/st/serialized"] = "zh/data/st/serializer", | |||
} | |||
if not serializers[modname] then | |||
return nil | |||
end | |||
local serializer = serializers[modname] | |||
local current_data = require("Module:" .. serializer).main(true) | |||
local stored_data = require("Module:" .. modname) | |||
if current_data ~= stored_data then | |||
discrepancy(modname, "<strong><u>Important!</u> Serialized data is out of sync. Use [[Module: ".. serializer .. "]] to update it. If you have made any changes to the underlying data, the serialized data <u>must</u> be updated before these changes will take effect.</strong>") | |||
end | end | ||
end | end | ||
Line 671: | Line 1,056: | ||
-- Warning: cannot be called twice in the same module invocation because | -- Warning: cannot be called twice in the same module invocation because | ||
-- some module-global variables are not reset between calls. | -- some module-global variables are not reset between calls. | ||
function export.do_checks() | function export.do_checks(modules) | ||
messages = setmetatable({}, { | messages = setmetatable({}, { | ||
__index = function (self, k) | __index = function (self, k) | ||
Line 680: | Line 1,065: | ||
}) | }) | ||
if modules["zh/data/ts"] or modules["zh/data/st"] then | |||
check_zh_trad_simp() | |||
end | |||
check_languages() | check_languages() | ||
check_etym_languages() | check_etym_languages() | ||
Line 688: | Line 1,076: | ||
check_families() | check_families() | ||
check_scripts() | check_scripts() | ||
if modules["labels/data"] then | |||
check_labels() | |||
end | |||
for module in pairs(modules) do | |||
check_serialization(module) | |||
end | |||
setmetatable(messages, nil) | setmetatable(messages, nil) | ||
Line 716: | Line 1,112: | ||
function export.format_message(modname, msglist) | function export.format_message(modname, msglist) | ||
local header; if modname:match("^Module:") or modname:match("^Template:") then | |||
header = "===[[" .. modname .. "]]===" | |||
else | |||
header = "===[[Module:" .. modname .. "]]===" | |||
end | |||
return header | |||
.. msglist | .. msglist | ||
:map( | :map( | ||
Line 725: | Line 1,126: | ||
end | end | ||
function export.check_modules( | function export.check_modules(args) | ||
local modules = {} | |||
for i, arg in ipairs(args) do | |||
modules[arg] = true | |||
end | |||
local ret = Array() | local ret = Array() | ||
local messages = export.do_checks() | local messages = export.do_checks(modules) | ||
for _, module in ipairs | |||
for _, module in ipairs(args) do | |||
local msglist = messages[module] | local msglist = messages[module] | ||
if msglist then | if msglist then | ||
Line 739: | Line 1,147: | ||
function export.check_modules_t(frame) | function export.check_modules_t(frame) | ||
local args = m_table.shallowcopy(frame.args) | local args = m_table.shallowcopy(frame.args) | ||
return export.check_modules | return export.check_modules(args) | ||
end | end | ||
function export.perform(frame) | function export.perform(frame) | ||
local messages = export.do_checks() | local messages = export.do_checks({}) | ||
-- Format the messages | -- Format the messages |
Latest revision as of 18:15, 18 September 2023
- The following documentation is located at Module:Data consistency check/documentation. [edit]
- Useful links: subpage list • transclusions • testcases • sandbox
This module checks the validity and internal consistency of the language, language family, and script data used on Wiktionary: the modules in Category:Language data modules as well as Module:scripts/data.
Output
Lua error in package.lua at line 80: module 'Module:languages/data/3/i/extra' not found.
Checks performed
For multiple data modules:
- Codes for languages, families and etymology-only languages must be unique and cannot clash with one another.
- Canonical names for languages, families, and etymology-only languages must not be found in the list of other names.
- Each name in the list of other names must appear only once.
- `otherNames`, if present, must be an array.
- Wikidata item IDs must be a positive integer or a string starting with `Q` and ending with decimal digits.
The following must be true of the data used by Module:languages:
- Each code must be defined in the correct submodule according to whether it is two-letter, three-letter or exceptional.
- The canonical name (field `1`) must be present and must not be the same as the canonical name of another language.
- If field `2` is not `nil`, it must be a valid Wikidata item ID.
- If field `3` or `family` is given and not `nil`, it must be a valid family code.
- If field `4` or `scripts` is given and not `nil`, it must be an array, and each string in the array must be a valid script code.
- If
ancestors
is given, it must be an array, and each string in the array must be a valid language or etymology language code. - If
family
is given, it must be a valid family code. - If
type
is given, it must be one of the recognised values (regular
,reconstructed
,appendix-constructed
). - If
entry_name
is given, it must be a table that contains either two arrays (from
andto
) or a string (remove_diacritics
) or both. - If
sort_key
is given, it may either be a string, or a table that in turn contains either two arrays (from
andto
) or a string (remove_diacritics
). - If
entry_name
orsort_key
is given, thefrom
array must be longer or equal in length to theto
array. - If
standardChars
is given, it must form a valid Lua string pattern when placed between square brackets with^
before it ("[^...]
). (It should match all characters regularly used in the language, but that cannot be tested.) - If
override_translit
is set,translit
must also be set, because there must be a transliteration module that can override manual transliteration. - If
link_tr
is present, it must betrue
. - Have no data keys besides these:
1, 2, 3, "entry_name", "sort_key", "display", "otherNames", "aliases", "varieties", "type", "scripts", "ancestors", "wikimedia_codes", "wikipedia_article", "standardChars", "translit", "override_translit", "link_tr"
.
Checks not performed:
- If
translit
is present, it should be the name of a module, and this module should contain atr
function that takes a pagename (and optionally a language code and script code) as arguments. - If
sort_key
is a string, it should be the name of a module, and this module should contain amakeSortKey
function that takes a pagename (and optionally a language code and script code) as arguments. - If
entry_name
orsort_key
is a table and contains a fieldremove_diacritics
, the value of the field should be a string that forms a valid Lua pattern when it is placed inside negated set notation ([^...]
).
These are not checked here, because module errors will quickly crop up in entries if these conditions are not met, assuming that Module:utilities attempts to generate a sortkey for a category pertaining to the language in question, or full_link
attempts to use the transliteration module.
Module:languages/code to canonical name and Module:languages/canonical names must contain all the codes and canonical names found in the data submodules of Module:languages, and no more.
The following must be true of the data used by Module:etymology languages:
- `canonicalName` must be given.
- `parent` must be given and must be a valid language, family or etymology-only language code.
- If
ancestors
is given, it must be an array, and each string in the array must be a valid language or etymology language code. The etymology language should also be listed as the ancestor of a regular language. - Have no data keys besides these:
"canonicalName", "otherNames", "parent", "ancestors", "wikipedia_article", "wikidata_item"
.
Codes in Module:families data must:
- Have
canonicalName
, which must not be the same as the canonical name of another family. - If
family
is given, it must be a valid family code. - Have at least one language or subfamily belonging to it.
- Have no data keys besides these:
"canonicalName", "otherNames", "family", "protoLanguage", "wikidata_item"
.
Codes in Module:scripts data must:
- Have
canonicalName
. - Have at least one language that lists it as one of its scripts.
- Have a
characters
pattern for script autodetection, and this must form a valid Lua string pattern when placed between square brackets ("[...]"
). (It should match all characters in the script, but that cannot be tested.) - Have no data keys besides these:
"canonicalName", "otherNames", "parent", "systems", "wikipedia_article", "characters", "direction"
.
local export = {}
local m_language_data = require("Module:languages/data/all")
local m_language_codes = require("Module:languages/code to canonical name")
local m_language_canonical_names = require("Module:languages/canonical names")
local m_etym_language_data = require("Module:etymology languages/data")
local m_etym_language_codes = require("Module:etymology languages/code to canonical name")
local m_etym_language_canonical_names = require("Module:etymology languages/canonical names")
local m_family_data = require("Module:families/data")
local m_family_codes = require("Module:families/code to canonical name")
local m_family_canonical_names = require("Module:families/canonical names")
local m_script_data = require("Module:scripts/data")
local m_languages = require("Module:languages")
local m_links = require("Module:links")
local m_scripts = require("Module:scripts")
local m_script_utils = require("Module:script utilities")
local m_table = require("Module:table")
local Array = require("Module:array")
-- Table of per-module message lists. It is declared here without a value;
-- discrepancy() indexes it by module name and calls :insert on the result.
local messages

--- Record one formatted problem report under `modname`.
-- The remaining arguments are passed to string.format unchanged. If
-- formatting or insertion raises an error, the error is written to the debug
-- log together with the original arguments instead of aborting the run.
local function discrepancy(modname, ...)
	local function record(...)
		messages[modname]:insert(string.format(...))
	end

	local succeeded, err = pcall(record, ...)
	if succeeded then
		return
	end
	mw.log(err, ...)
end
-- Shared bookkeeping that the individual checks fill in as they run.

-- Maps each code seen so far to the name of the module that defined it.
local all_codes = {}
-- Maps each language canonical name to the code that first claimed it.
local language_names = {}
-- NOTE(review): presumably canonical name -> code for families and scripts;
-- they are populated outside this excerpt, confirm there.
local family_names = {}
local script_names = {}
-- Set of family codes that at least one language gives as its family.
local nonempty_families = {}
-- NOTE(review): looks like families that are allowed to have no members;
-- confirm against the family checks.
local allowed_empty_families = {tbq = true}
-- Set of script codes listed by at least one language.
local nonempty_scripts = {}
--- Build a wikilink to the category of a language.
-- Returns "???" when no name is available. Names that already end in
-- "language"/"Language" are linked as they are; every other name gets
-- " language" appended to both the target and the label.
local function link(name)
	if not name then
		return "???"
	end

	local title = name
	if not name:find("[Ll]anguage$") then
		title = name .. " language"
	end
	return "[[:Category:" .. title .. "|" .. title .. "]]"
end
--- Build a wikilink to the category of a script.
-- Returns "???" when no name is available. Names ending in "code" or
-- "semaphore" link to a category titled with the name itself (first letter
-- upper-cased) and keep the original name as label; every other name gets
-- " script" appended to both the target and the label.
local function link_script(name)
	if not name then
		return "???"
	end

	local stands_alone = name:find("[Cc]ode$") or name:find("[Ss]emaphore$")
	if not stands_alone then
		local title = name .. " script"
		return "[[:Category:" .. title .. "|" .. title .. "]]"
	end

	-- gsub also returns a replacement count; only the string is wanted.
	local category = name:gsub("^%l", string.upper)
	return "[[:Category:" .. category .. "|" .. name .. "]]"
end
--- Report the unrecognised keys found in one data table.
-- `invalid_keys` is an Array of the offending keys. `is_script` selects
-- link_script instead of link for rendering the canonical name (data[1]).
local function invalid_keys_message(modname, code, data, invalid_keys, is_script)
	local several = #invalid_keys ~= 1
	local make_link = is_script and link_script or link

	local key_list = invalid_keys
		:map(function(key)
			return '<code>' .. key .. '</code>'
		end)
		:concat(", ")

	discrepancy(modname, "The data key%s %s for %s (<code>%s</code>) %s invalid.",
		several and "s" or "",
		key_list,
		make_link(data[1]),
		code,
		several and "are" or "is")
end
--- Create a checker that rejects data tables containing unknown keys.
-- `valid_keys` is the list of permitted keys; `is_script` is forwarded to
-- invalid_keys_message. The returned function takes (modname, code, data)
-- and reports every key of `data` that is not in the permitted set.
local function check_data_keys(valid_keys, is_script)
	local allowed = Array(valid_keys):to_set()

	return function (modname, code, data)
		local rejected
		for key in pairs(data) do
			if not allowed[key] then
				if not rejected then
					rejected = Array()
				end
				rejected:insert(key)
			end
		end

		if rejected then
			invalid_keys_message(modname, code, data, rejected, is_script)
		end
	end
end
-- Modification of isArray in [[Module:table]].
-- Returns the first index i, with 1 <= i <= number of entries in t, for
-- which t[i] is nil; returns nothing when the entries occupy 1..n exactly.
local function find_gap(t)
	local entry_count = 0
	for _ in pairs(t) do
		entry_count = entry_count + 1
	end

	for index = 1, entry_count do
		if t[index] == nil then
			return index
		end
	end
end
--- Report a field whose value is neither nil nor true.
-- @param modname     module name the message is filed under
-- @param code        code of the entry being checked
-- @param data        data table of the entry; data[1] is its canonical name
-- @param field_name  key in `data` to inspect
local function check_true_or_nil(modname, code, data, field_name)
	local value = data[field_name]
	if value == nil or value == true then
		return
	end
	-- Use the entry's own canonical name and the value of the field that was
	-- actually inspected (previously an undefined global and data.link_tr).
	discrepancy(modname,
		"%s (<code>%s</code>) has an <code>%s</code> value that is not <code>nil</code> or <code>true</code>: %s",
		link(data[1]), code, field_name,
		tostring(value)
	)
end
--- Check that a field holds a gap-free array.
-- The field `array_name` is looked up in `data`, or in data[subarray_name]
-- when `subarray_name` is given (that sub-table must exist, otherwise an
-- assertion error is raised). Returns true when the field is a table with
-- no gap; otherwise a discrepancy is reported and nothing is returned.
local function check_array(modname, code, data, array_name, subarray_name)
	local container = data
	local location = ""
	if subarray_name then
		container = assert(data[subarray_name], subarray_name)
		location = "the " .. subarray_name .. " field in "
	end

	local candidate = container[array_name]
	local kind = type(candidate)
	if kind ~= "table" then
		discrepancy(modname, "The %s field in %sthe data table for %s (<code>%s</code>) should be an array (table) but is %s.",
			array_name,
			location,
			data[1],
			code,
			kind == "nil" and "nil" or "a " .. kind)
		return
	end

	local gap = find_gap(candidate)
	if not gap then
		return true
	end
	discrepancy(modname, "The %s array in %sthe data table for %s (<code>%s</code>) has a gap at index %d.",
		array_name,
		location,
		data[1],
		code, gap)
end
--- Validate the Wikidata item id stored at data[key].
-- A missing value is accepted. A number must be a positive integer; a string
-- must consist of "Q" followed by decimal digits. Any other type is reported
-- as well (it used to pass silently).
-- @param modname  module name the message is filed under
-- @param code     code of the entry being checked
-- @param data     data table of the entry; data[1] is its canonical name
-- @param key      key in `data` that holds the Wikidata item id
local function check_wikidata_item(modname, code, data, key)
	local data_item = data[key]
	if data_item == nil then
		return
	end

	local item_type = type(data_item)
	if item_type == "number" then
		if not m_table.isPositiveInteger(data_item) then
			discrepancy(modname, "%g, the Wikidata item id for %s (<code>%s</code>), is not a positive integer or a string in the correct format.",
				data_item, data[1], code)
		end
	elseif item_type == "string" then
		if not data_item:find("^Q%d+$") then
			discrepancy(modname, "%s, the Wikidata item id for %s (<code>%s</code>), is not a string in the correct format or a positive integer.",
				data_item, data[1], code)
		end
	else
		discrepancy(modname, "The Wikidata item id for %s (<code>%s</code>) is a %s; it must be a positive integer or a string in the correct format.",
			tostring(data[1]), code, item_type)
	end
end
--- Check one list of alternative names (otherNames, aliases or varieties).
-- Reports the canonical name appearing in the list and any name that occurs
-- more than once. Nested tables are flattened one level when `allow_nested`
-- is truthy and reported otherwise.
-- @param modname         module name the message is filed under
-- @param code            code of the entry being checked
-- @param canonical_name  canonical name of the entry
-- @param data            table holding the list
-- @param data_key        key in `data` under which the list is stored
-- @param allow_nested    whether sub-tables of names are permitted
local function check_other_names_or_aliases(modname, code, canonical_name, data, data_key, allow_nested)
	local array = data[data_key]
	if not array then
		return
	end
	check_array(modname, code, data, data_key)
	-- check_array has already reported a value that is not a table; iterating
	-- it with ipairs would raise an error, so stop here.
	if type(array) ~= "table" then
		return
	end

	local names = {}
	local function check_other_name(other_name)
		if other_name == canonical_name then
			discrepancy(modname,
				"%s, the canonical name for <code>%s</code>, is repeated in the table of <code>%s</code>.",
				canonical_name, code, data_key)
		end
		if names[other_name] then
			discrepancy(modname,
				"The name %s is found twice or more in the list of <code>%s</code> for %s (<code>%s</code>).",
				other_name, data_key, canonical_name, code)
		end
		names[other_name] = true
	end

	for _, other_name in ipairs(array) do
		if type(other_name) ~= "table" then
			check_other_name(other_name)
		elseif allow_nested then
			for _, nested_name in ipairs(other_name) do
				check_other_name(nested_name)
			end
		else
			discrepancy(modname,
				"A nested table is found in the list of <code>%s</code> for %s (<code>%s</code>), but isn't allowed.",
				data_key, canonical_name, code)
		end
	end
end
--- Run the alternative-name checks on whichever of otherNames, aliases and
-- varieties are present in `data`. Only varieties may contain nested tables.
local function check_other_names_aliases_varieties(modname, code, canonical_name, data)
	-- Each entry: key to check, then whether nested tables are allowed.
	local name_lists = {
		{ "otherNames" },
		{ "aliases" },
		{ "varieties", true },
	}
	for _, spec in ipairs(name_lists) do
		local data_key = spec[1]
		if data[data_key] then
			check_other_names_or_aliases(modname, code, canonical_name, data, data_key, spec[2])
		end
	end
end
local get_codepoint = mw.ustring.codepoint

--- Validate a character-set pattern (the inside of a Lua "[...]" set).
-- Reports (1) a pattern that is not a string, (2) ranges "a-b" whose first
-- code point is not below the second, and (3) a pattern that does not
-- compile when wrapped in square brackets.
-- @param pattern        the pattern to validate
-- @param modname        module name the message is filed under
-- @param code           code of the entry being checked
-- @param data           data table of the entry; data[1] is its canonical name
-- @param standardChars  truthy when the pattern is a standardChars value
local function validate_pattern(pattern, modname, code, data, standardChars)
	if type(pattern) ~= "string" then
		-- Name first, then code, to match the placeholders. The remaining
		-- checks need a string, so stop after reporting.
		discrepancy(modname, '"%s", the %spattern for %s (<code>%s</code>), is not a string.',
			tostring(pattern), standardChars and 'standard character ' or '', tostring(data[1]), code)
		return
	end

	local ranges
	for lower, higher in mw.ustring.gmatch(pattern, "(.)%-%%?(.)") do
		if get_codepoint(lower) >= get_codepoint(higher) then
			ranges = ranges or Array()
			table.insert(ranges, { lower, higher })
		end
	end

	if ranges and ranges[1] then
		local plural = #ranges ~= 1 and "s" or ""

		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern ' ..
			'for %scharacter detection: <code>"%s"</code>. The first codepoint%s ' ..
			'in the range%s %s %s must be less than the second.',
			link(data[1]), code, standardChars and 'standard ' or '', pattern, plural, plural,
			ranges
				:map(
					function(range)
						return range[1] .. "-" .. range[2] .. (" (U+%X, U+%X)")
							:format(get_codepoint(range[1]), get_codepoint(range[2]))
					end)
				:concat(", "),
			#ranges ~= 1 and "are" or "is")
	end

	if not pcall(mw.ustring.find, "", "[" .. pattern .. "]") then
		-- The canonical name is stored at index 1; there is no
		-- canonical_name field, so the link used to render as "???".
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern for ' ..
			(standardChars and 'standard' or '') .. ' character detection: <code>"%s"</code>',
			link(data[1]), code, pattern)
	end
end
local remove_exceptions_addition = 0xF0000
local maximum_code_point = 0x10FFFF
-- Highest code point allowed in a remove_exceptions sequence.
local remove_exceptions_maximum_code_point = maximum_code_point - remove_exceptions_addition

--- Validate a replacements field (entry_name, sort_key or display).
-- A string value is accepted only for sort_key and entry_name. A table value
-- is checked for: `from`/`to` both present or both absent and both arrays,
-- `remove_diacritics` being a string, `remove_exceptions` being an array of
-- sequences within the allowed code point range, and `to` not being longer
-- than `from`. Any other value type is reported.
-- @param modname            module name the message is filed under
-- @param code               code of the entry being checked
-- @param data               data table of the entry; data[1] is its canonical name
-- @param replacements_name  key in `data` that holds the replacements
local function check_entry_name_or_sortkey(modname, code, data, replacements_name)
	local canonical_name = data[1]
	local replacements = data[replacements_name]
	local replacements_type = type(replacements)
	local string_allowed = replacements_name == "sort_key" or replacements_name == "entry_name"

	if replacements_type == "string" then
		if not string_allowed then
			discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be a table.",
				replacements_name, canonical_name, code)
		end
		return
	elseif replacements_type ~= "table" then
		-- Indexing a number or boolean below would raise an error.
		discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be a table%s.",
			replacements_name, tostring(canonical_name), code, string_allowed and " or a string" or "")
		return
	end

	if (replacements.from ~= nil) ~= (replacements.to ~= nil) then
		discrepancy(modname,
			"The <code>from</code> and <code>to</code> arrays in the <code>%s</code> table for %s (<code>%s</code>) are not both defined or both undefined.",
			replacements_name, canonical_name, code)
	elseif replacements.from then
		for _, key in ipairs { "from", "to" } do
			check_array(modname, code, data, key, replacements_name)
		end
	end

	if replacements.remove_diacritics and type(replacements.remove_diacritics) ~= "string" then
		discrepancy(modname,
			"The <code>remove_diacritics</code> field in the <code>%s</code> table for %s (<code>%s</code>) table must be a string.",
			replacements_name, canonical_name, code)
	end

	if replacements.remove_exceptions then
		if check_array(modname, code, data, "remove_exceptions", replacements_name) then
			for sequence_i, sequence in ipairs(replacements.remove_exceptions) do
				local code_point_i = 0
				for code_point in mw.ustring.gcodepoint(sequence) do
					code_point_i = code_point_i + 1
					if code_point > remove_exceptions_maximum_code_point then
						discrepancy(modname,
							"Code point #%d (0x%04X) in field #%d of the <code>remove_exceptions</code> array for %s (<code>%s</code>) is over U+%04X.",
							code_point_i, code_point, sequence_i, canonical_name, code, remove_exceptions_maximum_code_point)
					end
				end
			end
		end
	end

	-- The condition fires when `to` is longer than `from`, so the rule being
	-- enforced is "from is at least as long as to"; the message used to say
	-- the opposite.
	if replacements.from and replacements.to
		and m_table.length(replacements.to) > m_table.length(replacements.from) then
		discrepancy(modname,
			"The <code>from</code> array in the <code>%s</code> table for %s (<code>%s</code>) must be longer or the same length as the <code>to</code> array.",
			replacements_name, canonical_name, code)
	end
end
--- Tell whether any regular language lists `parent_code` as an ancestor.
-- The ancestors field may be an array of codes or a comma-separated string
-- (the same two forms check_ancestors accepts); a string used to make
-- pairs() raise an error here.
-- @param parent_code  code to look for among the ancestors
-- @return true when at least one language in m_language_data names it
local function has_regular_language_child(parent_code)
	for _, data in pairs(m_language_data) do
		local ancestors = data.ancestors
		if type(ancestors) == "string" then
			ancestors = mw.text.split(ancestors, "%s*,%s*")
		end
		if ancestors then
			for _, ancestor in pairs(ancestors) do
				if ancestor == parent_code then
					return true
				end
			end
		end
	end
	return false
end
--- Report every ancestor code that is neither a regular language code nor an
-- etymology-language code. `ancestors` may be an array of codes or a
-- comma-separated string. `is_etymology_language` is accepted but not used.
local function check_ancestors(modname, code, data, ancestors, is_etymology_language)
	local ancestor_codes = ancestors
	if type(ancestor_codes) == "string" then
		ancestor_codes = mw.text.split(ancestor_codes, "%s*,%s*")
	end

	local canonical_name = data[1]
	for _, candidate in ipairs(ancestor_codes) do
		local known = m_language_data[candidate] or m_etym_language_data[candidate]
		if not known then
			discrepancy(modname,
				"%s (<code>%s</code>) lists an invalid language code <code>%s</code> as ancestor.",
				link(canonical_name), code, candidate)
		end
	end
end
--- Cross-check a code-to-name module and a name-to-code module against the
-- codes and names gathered from the data modules.
-- `code_to_module_map` maps each known code to the module that defines it;
-- `name_to_code_map` maps each known canonical name to its code. Every pair
-- in the two lookup modules is compared with those maps, and messages are
-- filed under the corresponding lookup module's name.
local function check_code_to_name_and_name_to_code_maps(
		source_module_description,
		code_to_module_map, name_to_code_map,
		code_to_name_modname, code_to_name_module,
		name_to_code_modname, name_to_code_module)

	local function check_code_and_name(modname, code, canonical_name)
		local defining_module = code_to_module_map[code]
		local expected_code = name_to_code_map[canonical_name]

		if defining_module and expected_code then
			return
		end

		if not defining_module then
			if expected_code then
				discrepancy(modname,
					"<code>%s</code>, the code for the canonical name %s, is wrong; it should be <code>%s</code>.",
					code, canonical_name, expected_code)
			else
				discrepancy(modname,
					"The code <code>%s</code> and the canonical name %s should be removed; they are not found in %s.",
					code, canonical_name, source_module_description)
			end
			return
		end

		-- The code is known but the name is not: fetch the real name from
		-- the module that defines the code.
		local data_table = require("Module:" .. defining_module)[code]
		discrepancy(modname,
			"%s, the canonical name for the code <code>%s</code>, is wrong; it should be %s.",
			canonical_name, code, data_table[1])
	end

	for code, canonical_name in pairs(code_to_name_module) do
		check_code_and_name(code_to_name_modname, code, canonical_name)
	end

	for canonical_name, code in pairs(name_to_code_module) do
		check_code_and_name(name_to_code_modname, code, canonical_name)
	end
end
--- Report every code that has an entry in the extra-data module but none in
-- the corresponding main data module. Messages are filed under the
-- extra-data module's name.
local function check_extraneous_extra_data(
		data_modname, data_module, extra_data_modname, extra_data_module)
	local message = "Language code <code>%s</code> is not found in [[Module:%s]], and should be removed from [[Module:%s]]."
	for extra_code in pairs(extra_data_module) do
		local has_main_entry = data_module[extra_code]
		if not has_main_entry then
			discrepancy(extra_data_modname, message,
				extra_code, data_modname, extra_data_modname)
		end
	end
end
-- Render a family code for use in a message. A string is wrapped in <code>
-- tags; any other value is dumped with [[Module:debug]] so that a wrong kind
-- of value in that position does not cause a module error.
local function show_family_code(code)
	if type(code) ~= "string" then
		return require("Module:debug").highlight_dump(code)
	end
	return "<code>" .. code .. "</code>"
end
--- Validate the language data modules and the lookups derived from them.
-- Steps, in order:
--   1. the two-letter, three-letter (one module per initial letter) and
--      exceptional data modules, each together with its "/extra" module;
--   2. the code-to-name and name-to-code lookup modules;
--   3. the code/name pairs listed in [[Template:langname-lite]].
-- Returns "OK" once the template has been processed.
local function check_languages()
	local check_language_data_keys = check_data_keys{
		1, 2, 3, 4, -- canonical name, wikidata item, family, scripts
		"display_text", "generate_forms", "entry_name", "sort_key",
		"otherNames", "aliases", "varieties",
		"type", "ancestors",
		"wikimedia_codes", "wikipedia_article", "standardChars",
		"translit", "override_translit", "link_tr",
		"dotted_dotless_i"
	}

	-- Checks one language entry. `mainData` is the entry in the main data
	-- module and `extraData` the matching entry (if any) in "/extra".
	local function check_language(modname, code, data, mainData, extraData)
		local canonical_name, lang_type = data[1], data.type

		check_language_data_keys(modname, code, data)

		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
		else
			if not m_language_codes[code] then
				discrepancy("languages/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name)
			end
			all_codes[code] = modname
		end

		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link(canonical_name), code, language_names[canonical_name])
		else
			if not m_language_canonical_names[canonical_name] then
				discrepancy("languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			language_names[canonical_name] = code
		end

		check_wikidata_item(modname, code, data, 2)

		if extraData then
			check_other_names_aliases_varieties(modname, code, canonical_name, extraData)
		end

		if lang_type and not (lang_type == "regular" or lang_type == "reconstructed" or lang_type == "appendix-constructed") then
			discrepancy(modname, "%s (<code>%s</code>) is of an invalid type <code>%s</code>.", link(canonical_name), code, data.type)
		end

		if mainData.aliases then
			discrepancy(modname, "%s (<code>%s</code>) has the <code>aliases</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(canonical_name), code)
		end
		if mainData.varieties then
			discrepancy(modname, "%s (<code>%s</code>) has the <code>varieties</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(canonical_name), code)
		end
		if mainData.otherNames then
			discrepancy(modname, "%s (<code>%s</code>) has the <code>otherNames</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(canonical_name), code)
		end

		if not extraData then
			discrepancy(modname .. "/extra", "%s (<code>%s</code>) has data in [[Module:" .. modname .. "]], but does not have corresponding data in [[Module:" .. modname .. "/extra]].", link(canonical_name), code)
		--elseif extraData.otherNames then
		--	discrepancy(modname .. "/extra", "%s (<code>%s</code>) has <code>otherNames</code> key, but these should be changed to either <code>aliases</code> or <code>varieties</code>.", link(canonical_name), code)
		end

		-- Scripts: either an array of codes or a comma-separated string.
		local sc = data[4]
		if sc then
			if type(sc) == "string" then
				sc = mw.text.split(sc, "%s*,%s*")
			end
			if type(sc) == "table" then
				if not sc[1] then
					discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(canonical_name), code)
				else
					for _, sccode in ipairs(sc) do
						local cur_sc = m_script_data[sccode]

						if not cur_sc and sccode ~= "All" then
							discrepancy(modname,
								"%s (<code>%s</code>) lists an invalid script code <code>%s</code>.",
								link(canonical_name), code, sccode)
						-- elseif not cur_sc.characters then
						-- 	discrepancy(modname,
						-- 		"%s (<code>%s</code>) lists a script without characters <code>%s</code> (%s).",
						-- 		link(canonical_name), code, sccode, cur_sc[1])
						end

						nonempty_scripts[sccode] = true
					end
				end
			else
				discrepancy(modname,
					"The %s field for %s (<code>%s</code>) must be a table or string.",
					4, link(canonical_name), code)
			end
		end

		if data.ancestors then
			check_ancestors(modname, code, data, data.ancestors, false)
		end

		if data[3] then
			local family = data[3]
			if not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid family code %s.",
					link(canonical_name), code, show_family_code(family))
			end
			nonempty_families[family] = true
		end

		if data.sort_key then
			check_entry_name_or_sortkey(modname, code, data, "sort_key")
		end
		if data.entry_name then
			check_entry_name_or_sortkey(modname, code, data, "entry_name")
		end
		if data.display then
			check_entry_name_or_sortkey(modname, code, data, "display")
		end

		if data.standardChars then
			if type(data.standardChars) == "table" then
				local sccodes = {}
				-- `sc` is nil (or not a table) when the language has no usable
				-- scripts field; ipairs would raise an error on it.
				if type(sc) == "table" then
					for _, sccode in ipairs(sc) do
						sccodes[sccode] = true
					end
				end
				for sccode in pairs(data.standardChars) do
					if not (sccodes[sccode] or sccode == 1) then
						discrepancy(modname, "The field %s in the standardChars table for %s (<code>%s</code>) does not match any script for that language.",
							sccode, canonical_name, code)
					end
				end
			elseif type(data.standardChars) ~= "string" then
				discrepancy(modname, "The standardChars field in the data table for %s (<code>%s</code>) must be a string or table.",
					canonical_name, code)
			end
		end

		check_true_or_nil(modname, code, data, "override_translit")
		check_true_or_nil(modname, code, data, "link_tr")

		if data.override_translit and not data.translit then
			discrepancy(modname,
				"%s (<code>%s</code>) has <code>override_translit</code> set, but no transliteration module",
				link(canonical_name), code)
		end
	end

	-- Check two-letter codes
	local modname = "languages/data/2"
	local data2 = mw.loadData("Module:" .. modname)
	local extra_modname = modname .. "/extra"
	local extradata2 = mw.loadData("Module:" .. extra_modname)
	for code, data in pairs(data2) do
		if not code:find("^[a-z][a-z]$") then
			discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.', link(data[1]), code)
		end

		check_language(modname, code, data, data2[code], extradata2[code])
	end
	check_extraneous_extra_data(modname, data2, extra_modname, extradata2)

	-- Check three-letter codes
	for i = string.byte('a'), string.byte('z') do
		local letter = string.char(i)
		local modname = "languages/data/3/" .. letter
		local data3 = mw.loadData("Module:" .. modname)
		local extra_modname = modname .. "/extra"
		local extradata3 = mw.loadData("Module:" .. extra_modname)
		local code_pattern = "^" .. letter .. "[a-z][a-z]$"
		for code, data in pairs(data3) do
			if not code:find(code_pattern) then
				discrepancy(modname,
					'%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".',
					link(data[1]), code, letter)
			end

			check_language(modname, code, data, data3[code], extradata3[code])
		end
		check_extraneous_extra_data(modname, data3, extra_modname, extradata3)
	end

	-- Check exceptional codes
	modname = "languages/data/exceptional"
	local datax = mw.loadData("Module:" .. modname)
	extra_modname = modname .. "/extra"
	local extradatax = mw.loadData("Module:" .. extra_modname)
	for code, data in pairs(datax) do
		if code:find("^[a-z][a-z][a-z]?$") then
			discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.', link(data[1]), code)
		end

		check_language(modname, code, data, datax[code], extradatax[code])
	end
	check_extraneous_extra_data(modname, datax, extra_modname, extradatax)

	-- These checks must be done while all_codes only contains language codes:
	-- that is, after language data modules have been processed, but before
	-- etymology languages, families, and scripts have.
	check_code_to_name_and_name_to_code_maps(
		"a submodule of [[Module:languages]]",
		all_codes, language_names,
		"languages/code to canonical name", m_language_codes,
		"languages/canonical names", m_language_canonical_names)

	-- Check [[Template:langname-lite]]
	local frame = mw.getCurrentFrame()
	local content = mw.title.new("Template:langname-lite"):getContent()
	content = content:gsub("%<%!%-%-.-%-%-%>", "") -- remove comments
	local match = mw.ustring.gmatch(content, "\n\t*|#*([^\n]+)=([^\n]*)")
	while true do
		local codes, name = match()
		if not codes then return "OK" end
		if codes:len() > 1 and codes ~= "default" then
			for _, code in pairs(mw.text.split(codes, "|")) do
				local lang = m_languages.getByCode(code, nil, true, true)
				if name:match("etymcode") then
					local nonEtym_name = frame:preprocess(name)
					-- `lang` is nil for a code that is missing from the data;
					-- that case is reported below, so do not call methods on it.
					if lang then
						local nonEtym_real_name = lang:getNonEtymologicalName()
						if nonEtym_name ~= nonEtym_real_name then
							discrepancy("Template:langname-lite",
								"Code: <code>%s</code>. Saw name: %s. Expected name: %s.",
								code, nonEtym_name, nonEtym_real_name)
						end
					end
					-- Parentheses drop gsub's replacement count.
					name = frame:preprocess((name:gsub("{{{allow etym|}}}", "1")))
				elseif name:match("familycode") then
					name = name:match("familycode|(.-)|")
				end
				-- Values are passed as format arguments rather than being
				-- concatenated into the format string, so a "%" in a name
				-- cannot break string.format.
				if not lang then
					discrepancy("Template:langname-lite",
						"Code: <code>%s</code>. Saw name: %s. Language not present in data.",
						code, name)
				else
					local real_name = lang:getCanonicalName()
					if name ~= real_name then
						discrepancy("Template:langname-lite",
							"Code: <code>%s</code>. Saw name: %s. Expected name: %s.",
							code, name, real_name)
					end
				end
			end
		end
	end
end
-- Checks every entry of m_etym_language_data (reported under the module name
-- "etymology languages/data"): allowed keys, uniqueness of the code and of the
-- canonical name, the parent code, ancestors, the family code and the Wikidata
-- item. Afterwards it walks the parent chains looking for cycles and finally
-- cross-checks the code-to-name and name-to-code lookup modules.
-- Reads and writes the file-level tables all_codes, language_names and
-- nonempty_families; every problem found is reported through discrepancy().
local function check_etym_languages()
local modname = "etymology languages/data"
-- Key checker built from the list of permitted keys
-- (presumably reports any key that is not listed; check_data_keys is defined elsewhere).
local check_etymology_language_data_keys = check_data_keys{
1, 2, 3, 4, 5, -- canonical name, wikidata item, family, scripts, parent
"display_text", "generate_forms", "entry_name", "sort_key",
"otherNames", "aliases", "varieties",
"type", "main_code", "ancestors",
"wikimedia_codes", "wikipedia_article", "standardChars",
"translit", "override_translit", "link_tr",
"dotted_dotless_i"
}
-- Text used to name an etymology language in a message: "???" when there is
-- no name, otherwise the name with " language" appended unless it already
-- ends in "language"/"Language".
local function link(name)
if not name then
return "???"
elseif name:find("[Ll]anguage$") then
return name
else
return name .. " language"
end
end
-- First pass: checks on each entry taken on its own.
for code, data in pairs(m_etym_language_data) do
local canonical_name, parent =
data[1], data[5]
check_etymology_language_data_keys(modname, code, data)
-- all_codes maps a code to the module name that first defined it.
if all_codes[code] then
discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
else
if not m_etym_language_codes[code] then
discrepancy("etymology languages/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name)
end
all_codes[code] = modname
end
if not canonical_name then
discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
elseif language_names[canonical_name] then
-- The name is already recorded for another code. This is accepted only
-- when this entry's main_code equals the code of the language that the
-- canonical-name lookup returns.
local m_canonical_lang = m_languages.getByCanonicalName(canonical_name, nil, true)
if not m_canonical_lang then
discrepancy(modname, "%s (<code>%s</code>) has a canonical name that cannot be looked up.",
link(canonical_name), code)
elseif data.main_code ~= m_canonical_lang:getCode() then
discrepancy(modname,
"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
link(canonical_name), code, language_names[canonical_name])
end
else
-- First time this name is seen: it must be present in the name-to-code
-- module, and is recorded as belonging to this code.
if not m_etym_language_canonical_names[canonical_name] then
discrepancy("etymology languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
end
language_names[canonical_name] = code
end
check_other_names_aliases_varieties(modname, code, canonical_name, data)
-- The parent (data[5]) must be a string naming a language, a family or
-- another etymology-only language.
if parent then
if type(parent) ~= "string" then
-- NOTE(review): parent is truthy in this branch, so the
-- `parent == nil and "nil"` alternative below can never be selected.
discrepancy(modname,
"Etymology-only %s (<code>%s</code>) has a parent language or family code that is %s rather than a string.",
link(canonical_name), code, parent == nil and "nil" or "a " .. type(parent))
elseif not (m_language_data[parent] or m_family_data[parent] or m_etym_language_data[parent]) then
discrepancy(modname,
"Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.",
link(canonical_name), code, parent)
end
-- Mark the parent as used; check_families() later reports families that
-- were never marked in nonempty_families.
nonempty_families[parent] = true
else
discrepancy(modname,
"Etymology-only %s (<code>%s</code>) has no parent language or family code.",
link(canonical_name), code)
end
if data.ancestors then
check_ancestors(modname, code, data, data.ancestors, false)
end
-- An explicit family (data[3]) must exist in the family data and also
-- counts as a use of that family.
if data[3] then
local family = data[3]
if not m_family_data[family] then
discrepancy(modname,
"%s (<code>%s</code>) has an invalid family code %s.",
link(canonical_name), code, show_family_code(family))
end
nonempty_families[family] = true
end
check_wikidata_item(modname, code, data, 2)
end
-- Second pass: follow the chain of parents (data[5]) of every entry and
-- report a cycle. Both tables below are keyed by the entry tables themselves:
-- `checked` holds entries whose chain was already walked from an earlier
-- starting point, `stack` holds entries met during the current walk.
local checked = {}
for code, data in pairs(m_etym_language_data) do
local stack = {}
-- The walk ends when the parent is not an etymology-only language
-- (data becomes nil), when an already-walked entry is reached, or when
-- an entry of the current walk is met again (a cycle).
while data do
if checked[data] then
break
end
if stack[data] then
-- NOTE(review): "parent" is not among the keys listed for the key
-- checker above, so `data.parent` presumably is always nil and the
-- message falls back to data[5].
discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
link(data[1]), code,
link(m_etym_language_data[data[5]][1]), data.parent or data[5]
)
break
end
stack[data] = true
-- Step to the parent; `code` is overwritten so that a later message
-- names the entry currently being looked at.
code, data = data[5], data[5] and m_etym_language_data[data[5]]
end
for data in pairs(stack) do
checked[data] = true
end
end
-- Cross-check the two lookup modules against the codes and names collected so far.
check_code_to_name_and_name_to_code_maps(
"[[Module:etymology languages/data]]",
all_codes, language_names,
"etymology languages/code to canonical name", m_etym_language_codes,
"etymology languages/canonical names", m_etym_language_canonical_names)
end
-- Checks every entry of m_family_data (reported under the module name
-- "families/data"): allowed keys, uniqueness of the code and of the canonical
-- name, type of the Wikidata value, the parent family code, families that are
-- never used, and cycles in the chain of parent families. Finally cross-checks
-- the code-to-name and name-to-code lookup modules.
-- Reads and writes the file-level tables all_codes, family_names and
-- nonempty_families, and reads allowed_empty_families; every problem found is
-- reported through discrepancy().
local function check_families()
local modname = "families/data"
-- Key checker built from the list of permitted keys
-- (presumably reports any key that is not listed; check_data_keys is defined elsewhere).
local check_family_data_keys = check_data_keys{
1, 2, 3, -- canonical name, wikidata item, (parent) family
"type",
"protoLanguage", "otherNames", "aliases", "varieties",
}
-- Wikilink to the family's category, displayed as "<name> family".
-- " languages" is appended to the category title unless the name already
-- ends in "languages"/"Languages"; "???" is returned when there is no name.
local function link(name)
if not name then
return "???"
elseif name:find("[Ll]anguages$") then
return "[[:Category:" .. name .. "|" .. name .. " family]]"
else
return "[[:Category:" .. name .. " languages|" .. name .. " family]]"
end
end
-- First pass: checks on each entry taken on its own.
for code, data in pairs(m_family_data) do
check_family_data_keys(modname, code, data)
local canonical_name, family = data[1], data[3]
-- all_codes maps a code to the module name that first defined it.
if all_codes[code] then
discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
else
if not m_family_codes[code] then
discrepancy("families/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name)
end
all_codes[code] = modname
end
if not canonical_name then
discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
elseif family_names[canonical_name] then
discrepancy(modname,
"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
link(canonical_name), code, family_names[canonical_name])
else
-- First time this name is seen: it must be present in the name-to-code
-- module, and is recorded as belonging to this code.
if not m_family_canonical_names[canonical_name] then
discrepancy("families/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
end
family_names[canonical_name] = code
end
-- data[2] may be absent, but when present it has to be a number.
if data[2] and type(data[2]) ~= "number" then
discrepancy(modname, "%s (<code>%s</code>) has a wikidata item value that is not a number or <code>nil</code>: %s", link(canonical_name), code, mw.dumpObject(data[2]))
end
check_other_names_aliases_varieties(modname, code, canonical_name, data)
if family then
-- A family may not be its own parent; the code "qfa-not" is exempt.
if family == code and code ~= "qfa-not" then
discrepancy(modname,
"%s (<code>%s</code>) has itself as its family.",
link(canonical_name), code)
elseif not m_family_data[family] then
discrepancy(modname,
"%s (<code>%s</code>) has an invalid parent family code %s.",
link(canonical_name), code, show_family_code(family))
end
-- The parent family has at least this child.
nonempty_families[family] = true
end
check_wikidata_item(modname, code, data, 2)
end
-- Second pass: report families that nothing marked as used and that are not
-- listed in allowed_empty_families. This has to run after every place that
-- fills nonempty_families, including the loop above.
for code, data in pairs(m_family_data) do
if not (nonempty_families[code] or allowed_empty_families[code]) then
discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data[1]), code)
end
end
-- Third pass: follow the chain of parent families (data[3]) of every entry
-- and report a cycle. Both tables are keyed by family code: `checked` holds
-- codes whose chain was already walked ("qfa-not" is pre-marked so it is
-- never walked), `stack` holds codes met during the current walk.
local checked = { ['qfa-not'] = true }
for code, data in pairs(m_family_data) do
local stack = {}
-- The walk ends when there is no data for the parent code (data becomes
-- nil), when an already-walked code is reached, or when a code of the
-- current walk is met again (a cycle).
while data do
if checked[code] then
break
end
if stack[code] then
discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
link(data[1]), code,
link(m_family_data[data[3]][1]), data[3]
)
break
end
stack[code] = true
-- Step to the parent family; `code` is overwritten so that a later
-- message names the family currently being looked at.
code, data = data[3], m_family_data[data[3]]
end
for code in pairs(stack) do
checked[code] = true
end
end
-- Cross-check the two lookup modules against the codes and names collected so far.
check_code_to_name_and_name_to_code_maps(
"[[Module:families/data]]",
all_codes, family_names,
"families/code to canonical name", m_family_codes,
"families/canonical names", m_family_canonical_names)
end
-- Checks every entry of m_script_data (reported under "scripts/data"):
-- presence in the code-to-name and name-to-code lookup modules (four-character
-- codes only), allowed keys, canonical name, alternative names, whether any
-- language uses the script, and the validity of its character pattern.
-- Reads nonempty_scripts and fills script_names; problems are reported through
-- discrepancy().
local function check_scripts()
	local modname = "scripts/data"

	local validate_keys = check_data_keys({
		1, 2, -- canonical name, writing systems
		"canonicalName", "otherNames", "aliases", "varieties", "parent",
		"wikipedia_article", "characters", "spaces", "capitalized", "translit", "direction",
		"character_category", "normalizationFixes"
	}, true)

	local code_to_name = require('Module:scripts/code to canonical name')
	local name_to_code = require('Module:scripts/by name')

	-- check_code_to_name_and_name_to_code_maps wants a code -> module map;
	-- this one exists only to be handed to it.
	local module_of_code = {}

	for sc_code, sc_data in pairs(m_script_data) do
		local sc_name = sc_data[1]

		-- Only four-character codes are expected in the lookup modules.
		if not code_to_name[sc_code] and #sc_code == 4 then
			discrepancy('scripts/code to canonical name', '<code>%s</code> (%s) is missing', sc_code, sc_name)
		end

		validate_keys(modname, sc_code, sc_data)

		if sc_name then
			-- A name that is already recorded is silently accepted (no
			-- duplicate-name report is made for scripts).
			if not script_names[sc_name] then
				if not name_to_code[sc_name] and #sc_code == 4 then
					discrepancy('scripts/by name', '%s (<code>%s</code>) is missing', sc_name, sc_code)
				end
				script_names[sc_name] = sc_code
			end
		else
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", sc_code)
		end

		check_other_names_aliases_varieties(modname, sc_code, sc_name, sc_data)

		if not nonempty_scripts[sc_code] then
			local linked = link_script(sc_name)
			local tail
			if sc_data.characters then
				tail = ""
			else
				tail = " and has no characters listed for auto-detection"
			end
			discrepancy(modname,
				"%s (<code>%s</code>) is not used by any language%s.",
				linked, sc_code, tail)
		end

		if sc_data.characters then
			validate_pattern(sc_data.characters, modname, sc_code, sc_data, false)
		end

		module_of_code[sc_code] = modname
	end

	check_code_to_name_and_name_to_code_maps(
		"a submodule of [[Module:scripts]]",
		module_of_code, script_names,
		"scripts/code to canonical name", code_to_name,
		"scripts/by name", name_to_code)
end
-- Checks the label data modules: the three general ones
-- ("Module:labels/data", ".../regional", ".../topical") and, for every code in
-- m_language_codes, "Module:labels/data/lang/<code>" when it can be loaded.
-- A label's data must be a string or a table; tables are passed to the key
-- checker. Problems are reported through discrepancy() under the full module
-- name, including the "Module:" prefix.
local function check_labels()
	local validate_keys = check_data_keys{
		"display", "Wikipedia", "glossary",
		"plain_categories", "topical_categories", "pos_categories", "regional_categories", "sense_categories",
		"omit_preComma", "omit_postComma", "omit_preSpace",
		"deprecated", "track"
	}

	-- Check every label of one already-loaded label data module.
	local function check_all_labels(modname, labels)
		for label, entry in pairs(labels) do
			local kind = type(entry)
			if kind == "table" then
				validate_keys(modname, label, entry)
			elseif kind ~= "string" then
				discrepancy(modname,
					"The data for label <code>%s</code> is a %s; only tables and strings are allowed.",
					label, kind)
			end
		end
	end

	-- General modules: these are required unconditionally.
	for _, suffix in ipairs{"", "/regional", "/topical"} do
		local modname = "Module:labels/data" .. suffix
		check_all_labels(modname, require(modname))
	end

	-- Language-specific modules: a module that fails to load is skipped.
	for lang_code in pairs(m_language_codes) do
		local modname = "Module:labels/data/lang/" .. lang_code
		local loaded, labels = pcall(require, modname)
		if loaded then
			check_all_labels(modname, labels)
		end
	end
end
-- Checks that the two Chinese character conversion tables agree with each
-- other: for every pair char -> other in one table, the opposite table is
-- expected to map other back to char. When it does not, the chain of
-- conversions is followed and one of three problems is reported through
-- discrepancy(): a character that maps to itself, a character missing from
-- the opposite table, or a chain that ends in a different round-trip pair.
--
-- Index 0 always refers to the "st" table / "Simp." / Hans and index 1 to the
-- "ts" table / "Trad." / Hant; (x + 1) % 2 selects the opposite side.
local function check_zh_trad_simp()
	local m_ts = mw.loadData("Module:zh/data/ts")
	local m_st = mw.loadData("Module:zh/data/st")
	local ruby = require("Module:ja-ruby").ruby_auto
	local lang = m_languages.getByCode("zh")
	local Hant = m_scripts.getByCode("Hant")
	local Hans = m_scripts.getByCode("Hans")

	local data = {[0] = m_st.st, m_ts.ts}
	local mod = {[0] = "st", "ts"}
	local var = {[0] = "Simp.", "Trad."}
	local sc = {[0] = Hans, Hant}

	-- Link to `term`, displayed with a ruby annotation naming the variety
	-- (var[side]) and tagged with the script sc[side].
	local function annotated_link(term, side)
		local display = ruby({["markup"] = "[" .. term .. "](" .. var[side] .. ")"})
		return m_links.language_link({term = term, alt = display, lang = lang, sc = sc[side], tr = "-"}, false)
	end

	-- Follows the conversion chain starting at `other`, which was obtained from
	-- table j % 2 and is therefore looked up in table (j + 1) % 2. Appends one
	-- display item per step to `chars` and returns chars plus an issue number:
	--   1  `other` maps to itself
	--   2  `other` is absent from the table it is looked up in
	--   3  the chain reached a pair that converts back and forth consistently
	-- NOTE(review): a chain of mappings that cycles with a period longer than
	-- two never reaches any of these cases and would keep recursing.
	local function find_stable_loop(chars, other, j)
		local here, back = (j + 1) % 2, j % 2
		table.insert(chars, annotated_link(other, here))

		local mapped = data[here][other]
		if mapped == other then
			table.insert(chars, other)
			return chars, 1
		elseif not mapped then
			table.insert(chars, "not found")
			return chars, 2
		elseif data[back][mapped] ~= other then
			return find_stable_loop(chars, mapped, j + 1)
		else
			table.insert(chars, annotated_link(mapped, back) .. " (")
			table.insert(chars, annotated_link(data[back][mapped], here) .. " etc.)")
			return chars, 3
		end
		-- Every branch above returns; the former trailing `return chars, issue`
		-- was unreachable and read an undeclared global, so it was removed.
	end

	for i = 0, 1, 1 do
		for char, other in pairs(data[i]) do
			if data[(i+1)%2][other] ~= char then
				local chars, issue = {}
				table.insert(chars, annotated_link(char, i))
				chars, issue = find_stable_loop(chars, other, i)

				if issue == 1 or issue == 2 then
					-- The next-to-last item tells on which side the chain
					-- stopped; that decides the module to blame and the script
					-- used for the odd/even positions.
					local j
					if chars[#chars-1]:match(var[(i+1)%2]) then
						j = 1
					else
						j = 0
					end
					local mod_this = mod[(i+j)%2]
					local sc_this = {[0] = sc[(i+j)%2], sc[(i+j+1)%2]}
					for k, item in ipairs(chars) do
						chars[k] = m_script_utils.tag_text(item, lang, sc_this[k%2], "term")
					end
					-- The chain text is passed as a %s argument, not as the
					-- format string, so a "%" inside the generated markup
					-- cannot be misread as a format directive.
					if issue == 1 then
						discrepancy("zh/data/" .. mod_this, "character references itself: %s", table.concat(chars, " → "))
					elseif issue == 2 then
						discrepancy("zh/data/" .. mod_this, "missing character: %s", table.concat(chars, " → "))
					end
				elseif issue == 3 then
					for j, item in ipairs(chars) do
						chars[j] = m_script_utils.tag_text(item, lang, sc[(i+j)%2], "term")
					end
					discrepancy("zh/data/" .. mod[i], "possible mismatched character: %s", table.concat(chars, " → "))
				end
			end
		end
	end
end
-- For the serialized data modules known below, regenerates the data with the
-- matching serializer module (calling its main(true)) and compares the result
-- with what the serialized module currently holds; a difference is reported
-- through discrepancy(). Any other module name is ignored and nil is returned.
local function check_serialization(modname)
	local serializer_of = {
		["Hani-sortkey/data/serialized"] = "Hani-sortkey/serializer",
		["zh/data/ts/serialized"] = "zh/data/ts/serializer",
		["zh/data/st/serialized"] = "zh/data/st/serializer",
	}

	local serializer = serializer_of[modname]
	if not serializer then
		return nil
	end

	local regenerated = require("Module:" .. serializer).main(true)
	local stored = require("Module:" .. modname)
	if regenerated == stored then
		return
	end

	discrepancy(modname, "<strong><u>Important!</u> Serialized data is out of sync. Use [[Module: ".. serializer .. "]] to update it. If you have made any changes to the underlying data, the serialized data <u>must</u> be updated before these changes will take effect.</strong>")
end
-- Runs the consistency checks and returns the collected messages as a plain
-- table mapping a module (or template) name to an Array of message strings,
-- each Array sorted.
--
-- `modules` is a set (name -> true) that switches on the optional checks:
-- "zh/data/ts" or "zh/data/st" enables the Chinese conversion-table check,
-- "labels/data" enables the label check, and every key is also offered to
-- check_serialization(). The language, etymology-language, family and script
-- checks always run.
--
-- Warning: cannot be called twice in the same module invocation because
-- some module-global variables are not reset between calls.
function export.do_checks(modules)
	-- While the checks run, indexing `messages` with an unseen name creates an
	-- empty Array for it, so discrepancy() can insert without a nil test.
	-- (`messages` is not declared here; it is the variable discrepancy() uses.)
	messages = setmetatable({}, {
		__index = function (self, k)
			local val = Array()
			self[k] = val
			return val
		end
	})

	if modules["zh/data/ts"] or modules["zh/data/st"] then
		check_zh_trad_simp()
	end

	check_languages()
	check_etym_languages()

	-- families and scripts must be checked AFTER languages; languages checks fill out
	-- the nonempty_families and nonempty_scripts tables, used for testing if a family/script
	-- is ever used in the data
	check_families()
	check_scripts()

	if modules["labels/data"] then
		check_labels()
	end

	for module in pairs(modules) do
		check_serialization(module)
	end

	-- From here on, a lookup of an unknown name must yield nil again.
	setmetatable(messages, nil)

	-- First code mentioned in a message, or nil when it mentions none.
	local function find_code(message)
		return string.match(message, "<code>([^<]+)</code>")
	end
	find_code = require("Module:fun").memoize(find_code)

	-- Strict total order for table.sort: messages that mention a code come
	-- first, ordered by that code and then by their full text; messages
	-- without a code follow, ordered by their text. The earlier comparator
	-- compared by code when both messages had one and by text otherwise,
	-- which is not transitive for mixed lists and leaves equal codes in
	-- arbitrary order.
	local function comp(message1, message2)
		local code1, code2 = find_code(message1), find_code(message2)
		if code1 and code2 then
			if code1 ~= code2 then
				return code1 < code2
			end
			return message1 < message2
		elseif code1 or code2 then
			return code1 ~= nil
		else
			return message1 < message2
		end
	end

	for modname, msglist in pairs(messages) do
		msglist:sort(comp)
	end

	-- Hand the table to the caller and drop the shared reference.
	local ret = messages
	messages = nil
	return ret
end
-- Formats the messages of one module as wikitext: a level-3 heading linking
-- to the page, followed by one bullet line per message. `modname` is linked
-- as given when it already starts with "Module:" or "Template:"; otherwise
-- "Module:" is prepended. `msglist` must offer map() and concat() (an Array).
function export.format_message(modname, msglist)
	local page = modname
	if not (modname:match("^Module:") or modname:match("^Template:")) then
		page = "Module:" .. modname
	end
	local heading = "===[[" .. page .. "]]==="

	local bullets = msglist:map(function(msg)
		return "\n* " .. msg
	end)

	return heading .. bullets:concat()
end
-- Runs the checks for the module names listed in `args` (a sequence of
-- strings) and returns the formatted messages of exactly those modules, in
-- the order given, joined by newlines. Modules without messages contribute
-- nothing.
function export.check_modules(args)
	-- do_checks() expects a set, not a list.
	local modules = {}
	for _, arg in ipairs(args) do
		modules[arg] = true
	end

	local ret = Array()
	local messages = export.do_checks(modules)
	for _, module in ipairs(args) do
		-- Some checks (the label check) file their messages under the full
		-- page name including "Module:", while callers pass the bare name
		-- that do_checks() tests for ("labels/data"). Try both keys so those
		-- messages are not silently dropped.
		local msglist = messages[module] or messages["Module:" .. module]
		if msglist then
			ret:insert(export.format_message(module, msglist))
		end
	end
	return ret:concat("\n")
end
-- Template entry point for check_modules(): the arguments of `frame` are
-- copied into an ordinary table first and then handed over.
function export.check_modules_t(frame)
	return export.check_modules(m_table.shallowcopy(frame.args))
end
-- Entry point that runs the checks with no optional module selected and
-- returns wikitext: a success line when no module has messages, otherwise a
-- warning line followed by one formatted section per module, in the order
-- produced by m_table.sortedPairs. `frame` is not used.
function export.perform(frame)
	local messages = export.do_checks({})

	-- Format the messages
	local ret = Array()
	local section_count = 0
	for modname, msglist in m_table.sortedPairs(messages) do
		ret:insert(export.format_message(modname, msglist))
		section_count = section_count + 1
	end

	-- Are there any messages? The former test `i == 1` read a variable that is
	-- never assigned, so the success branch could not be reached.
	if section_count == 0 then
		return '<b class="success">Glory to Arstotzka.</b>'
	end

	ret:insert(1, '<b class="warning">Discrepancies detected:</b>')
	return ret:concat('\n')
end
return export