Module:User:Erutuon/script recognition
Jump to navigation
Jump to search
- The following documentation is located at Module:User:Erutuon/script recognition/documentation. [edit]
- Useful links: root page • root page's subpages • transclusions • testcases • user page • user talk page • userspace
This module generated the codepoint-to-script lookup table in Module:Unicode data/scripts.
Lua error at line 421: attempt to index local 'overrider' (a nil value).
local export = {}
local getCodepoint = mw.ustring.codepoint
local U = mw.ustring.char
local floor = math.floor
local title = mw.title.getCurrentTitle().fullText
local function check(funcName, expectType)
return function(argIndex, arg)
require("libraryUtil").checkType(funcName, argIndex, arg, expectType)
end
end
function mw.logf(...)
return mw.log(string.format(...))
end
local output_mt = {}
function output_mt:insert(str)
self.n = self.n + 1
self[self.n] = str
end
function output_mt:insert_format(...)
self:insert(string.format(...))
end
output_mt.join = table.concat
output_mt.__index = output_mt
local function Output()
return setmetatable({ n = 0 }, output_mt)
end
local function dump(val)
local output = Output()
output:insert('{\n')
local range_format =
[[
{ 0x%05X, 0x%05X, "%s" },
]]
local length_format = -- also close range array
[[
length = %d,
},
]]
for i = 0, 0x10FFFF / 0x100 do
local ranges = val[i]
if ranges then
output:insert_format(
[[
[0x%02X] = {
]], i)
for j, range in ipairs(ranges) do
output:insert_format(range_format, unpack(range))
end
output:insert_format(length_format, ranges.length or -1)
end
end
output:insert
[[
individual = {
]]
for codepoint, script in require "Module:table".sortedPairs(val.individual) do
output:insert_format(
[[
[0x%05X] = "%s",
]], codepoint, script)
end
output:insert [[
},
blocks = {
]]
for _, blockRange in ipairs(val.blocks) do
output:insert_format(
[[
{ 0x%02X, 0x%02X, "%s" },
]], unpack(blockRange))
end
output:insert
[[
},
}]]
return require "Module:debug".highlight(table.concat(output))
end
local function printRanges(ranges)
local output = Output()
output:insert("Ranges:")
for _, range in ipairs(ranges) do
output:insert_format('\n\tU+%04X-U+%04X: %s', unpack(range))
end
mw.log(output:join())
end
local function hasContents(t)
if next(t) then
return true
else
return false
end
end
local function log(message)
if title:match("testcases/documentation$") then
mw.log(message)
end
end
local function makeRangeKey(codepoint)
return floor(codepoint / 0x1000)
end
local function isInRange(value, lower, upper)
-- mw.log(value, lower, upper)
local check = check("isInRange", "number")
check(1, value)
check(2, lower)
check(3, upper)
return value >= lower and value <= upper
end
local function lookupCharacter(characterLookup, character)
local codepoint
if type(character) == "string" then
if mw.ustring.len(character) == 1 then
codepoint = getCodepoint(character)
else
error("Character " .. character .. " has length " .. mw.ustring.len(character) .. ". It is supposed to be a single character.")
end
elseif type(character) == "number" then
codepoint = character
else
error("Character is the wrong type: " .. type(character) .. ".")
end
if characterLookup.smallest and not isInRange(codepoint, characterLookup.smallest, characterLookup.largest) then
return false
elseif characterLookup.values and characterLookup.values[codepoint] then
return true
else
for i, range in ipairs(characterLookup) do
if isInRange(codepoint, range[1], range[2]) then
return true
end
end
end
return false
end
local function forEachChar(str, func)
if type(func) == "function" then
for i = 1, mw.ustring.len(str) do
char = mw.ustring.sub(str, i, i)
func(char)
end
end
end
function export.makeCharacterLookup(pattern)
local characterLookup = {}
local values = {}
local allValues = {}
local i = 1
-- Create ranges in which all characters belong to the script.
local workingString = mw.ustring.gsub(
pattern,
"([^-])%-([^-])",
function(item1, item2)
local codepoint1, codepoint2 = getCodepoint(item1), getCodepoint(item2)
--[[
if not (codepoint1 < codepoint2) then
error("Wrong codepoint order with " .. U(codepoint1) .. " and " .. U(codepoint2) .. "!")
end
]]
table.insert(characterLookup, { codepoint1, codepoint2 })
allValues[codepoint1] = true
allValues[codepoint2] = true
return ""
end
)
if workingString ~= "" then
workingString = mw.ustring.gsub(
workingString,
".",
function(char)
local codepoint = getCodepoint(char)
values[codepoint] = true
allValues[codepoint] = true
end
)
end
--[[
Place the tables of ranges in the Unicode order (the patterns
should already be in that order, but just to be safe).
]]
table.sort(
characterLookup,
function(item1, item2)
return item1[1] < item2[1]
end)
local allValuesKeys = require("Module:table").numKeys(allValues)
local smallest, largest = allValuesKeys[1], allValuesKeys[#allValuesKeys]
-- Don't create an empty values table.
if hasContents(values) then
characterLookup.values = values
end
--[[
Don't record the smallest and largest values if they're found in the
first range.
]]
if not (smallest == characterLookup[1][1] and largest == characterLookup[1][2]) then
characterLookup.smallest, characterLookup.largest = smallest, largest
end
return characterLookup
end
function export.makeAllScriptsCharacterLookup()
local allScriptsCharacterLookup = {}
local patternToScript = {}
for code, data in pairs(require("Module:scripts/data")) do
if data.characters then
-- Don't generate identical lookup table twice.
local scriptWithPattern = patternToScript[data.characters]
if scriptWithPattern then
allScriptsCharacterLookup[code] = allScriptsCharacterLookup[scriptWithPattern]
else
allScriptsCharacterLookup[code] = export.makeCharacterLookup(data.characters)
end
patternToScript[data.characters] = code
end
end
return allScriptsCharacterLookup
end
-- fa-Arab → Arab-fa
local function switchLangSc(scriptCode)
return scriptCode:gsub("^([^-]+)%-(.+)$", "%2-%1")
end
-- To ensure that Grek and Latn appear first.
-- This also makes Grek and Latn take precedence when generating
-- the codepoint-to-script lookup table.
local scriptCodeReplacements = {
polytonic = "Grek2",
Latinx = "Latnx",
Latf = "Latnf",
}
local function modifyAdHocCode(code)
if scriptCodeReplacements[code] then
return scriptCodeReplacements[code]
elseif not (code:find("%u%l%l%l") or code:find("%l%l%l%-%u%l%l%l")) then
return code:gsub("^(.+)$", "~%1")
else
return code
end
end
local function keySort(key1, key2)
local type1, type2 = type(key1), type(key2)
if type1 == "number" and type2 == "string" then
return true
elseif type1 == "string" and type2 == "number" then
return false
elseif type1 == "string" then
key1, key2 = modifyAdHocCode(key1), modifyAdHocCode(key2)
key1, key2 = switchLangSc(key1), switchLangSc(key2)
local lower1, lower2 = mw.ustring.lower(key1), mw.ustring.lower(key2)
return lower1 < lower2
else
return key1 < key2
end
end
local function hex(number)
return string.format("0x%X", number)
end
local function divideRange(lower, upper, width, testing)
local ranges = {}
if not (lower and upper) then
mw.log("divideRange failed:", lower, upper, width, testing)
return nil
end
local position = floor(lower / width)
local start = position * width
local i = 0
local increment = i * width
repeat
local range1 = start + increment
local range2 = range1 + width - 1
if range1 < lower then
range1 = lower
end
if range2 > upper then
range2 = upper
end
if testing then
range1, range2 = hex(range1), hex(range2)
end
ranges[position + i] = { range1, range2 }
i = i + 1
increment = i * width
until
start + increment > upper
return ranges
end
function export.showDividedRange(frame)
local lower = 0x2A700
local higher = 0x2B73F
local width = 0x1000
local dividedRange = divideRange(lower, higher, width, true)
return table.concat({ hex(lower), hex(higher) }, ", ") .. dump(dividedRange)
end
-- Scripts that consist entirely of characters from another script.
local scriptBlacklist = {
["Latf"] = true;
["Hans"] = true;
["Hant"] = true;
["Kore"] = true;
["Jpan"] = true;
["fa-Arab"] = true;
["kk-Arab"] = true;
["ks-Arab"] = true;
["ku-Arab"] = true;
["ms-Arab"] = true;
["mzn-Arab"] = true;
["ota-Arab"] = true;
["pa-Arab"] = true;
["ps-Arab"] = true;
["sd-Arab"] = true;
["tt-Arab"] = true;
["ug-Arab"] = true;
["ur-Arab"] = true;
["nv-Latn"] = true;
["pjt-Latn"] = true;
["Zyyy"] = true;
}
local function sortRange(range1, range2)
local number1, number2 = tonumber(range1[1]), tonumber(range2[1])
if number1 == number2 then
return keySort(range1[3], range2[3])
else
return number1 < number2
end
end
local function printScriptRange(range, hideScriptName)
if hideScriptName then
return ("U+%04X-U+%04X"):format(range[1], range[2])
else
return ("%s (U+%04X-U+%04X)"):format(range[3], range[1], range[2])
end
end
-- When there is overlap between ranges belonging to two different scripts,
-- the key in this table overrides the value.
local overrides = {
Beng = "as-Beng",
Cyrl = "Cyrs",
Grek = "polytonic",
Latn = "Latinx",
}
local function fixRangeOverlaps(ranges)
local prev
local i = 1
while ranges[i] do
range = ranges[i]
prev = ranges[i - 1]
if prev and (range[1] <= prev[1] or range[2] <= prev[2]) then
-- mw.logf("%s in conflict with %s",
-- printScriptRange(prev), printScriptRange(range))
local overrider, overridden
if overrides[range[3]] == prev[3] then
overrider, overridden = range, prev
elseif overrides[prev[3]] == range[3] then
overrider, overridden = prev, range
end
if overrider and overridden then
mw.logf("%s overrides %s", printScriptRange(overrider),
printScriptRange(overridden))
else
mw.logf("Should %s override %s or the other way around?",
printScriptRange(prev), printScriptRange(range))
end
if overrider[1] <= overridden[1] then -- low end of overridden is inside overrider
if overridden[2] <= overrider[2] then -- overridden entirely within overrider
table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
if overridden == prev then
i = i - 1
end
else -- upper part of overridden outside of overrider
if overridden[2] - overrider[2] == 1 then -- one codepoint of overridden is outside overrider
table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
if overridden == prev then
i = i - 1
end
individual[overridden[2]] = overridden[3]
else
overridden[1] = overrider[2] + 1
end
end
else -- overridden[1] < overrider[1]: low end of overridden is outside overrider
-- single codepoint at low end of overridden is outside overrider
table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
if overridden == prev then
i = i - 1
end
if overrider[1] - overridden[1] == 1 then
individual[overridden[1]] = overridden[3]
else -- multiple codepoints at low end of overridden are outside overrider
ranges:insert(i,
{ overridden[1], overrider[1] - 1, overridden[3] })
i = i + 1
end
if overrider[2] < overridden[2] then -- high end of overridden is outside overrider
-- single codepoint at high end of overridden is outside of overrider
if overridden[2] - overrider[2] == 1 then
individual[overridden[2]] = overridden[3]
else
ranges:insert(i,
{ overrider[2] + 1, overridden[2], overridden[3] })
i = i + 1
end
end
end
end
i = i + 1
end
end
local function checkRangeOverlaps(ranges)
local prev
for i, range in ipairs(ranges) do
if prev and prev[2] >= range[1] then
mw.logf("%s overlaps with %s",
printScriptRange(prev), printScriptRange(range))
end
prev = range
end
end
local function makeCodepointToScriptLookup(testing)
local output = {}
local ranges_mt = {
insert = function (self, i, value)
if value ~= nil then
if self[i][1] < value[1] then
i = i + 1
end
mw.logf("Inserting %s below %s",
printScriptRange(value), printScriptRange(self[i]))
table.insert(self, i, value)
else
value = i
table.insert(self, value)
end
end,
remove = table.remove,
}
ranges_mt.__index = ranges_mt
setmetatable(output,
{
__index = function (self, key)
local val = setmetatable({}, ranges_mt)
self[key] = val
return val
end,
})
output.individual = {}
local individual = output.individual
local rangeStrings = {}
local allScriptsCharacterLookup = export.makeAllScriptsCharacterLookup()
for scriptCode, lookup in require("Module:table").sortedPairs(allScriptsCharacterLookup, keySort) do
if not scriptBlacklist[scriptCode] then
for key, value in ipairs(lookup) do
if type(value) == "table" then
local newRanges = divideRange(value[1], value[2], 0x1000, testing)
if newRanges then
for position, newRange in pairs(newRanges) do
local rangeString = newRange[1] .. newRange[2]
if rangeStrings[rangeString] then
mw.logf("The range U+%04X-U+%04X is already "
.. "recorded as belonging to the script "
.. "code %s.",
newRange[1], newRange[2], rangeStrings[rangeString])
else
rangeStrings[rangeString] = scriptCode
output[position]:insert({ newRange[1], newRange[2], scriptCode })
end
end
end
end
end
if lookup.values then
for codepoint in pairs(lookup.values) do
if individual[codepoint] then
mw.logf("The codepoint %s is already recorded as " ..
"belonging to the script code %s.",
hex(codepoint), individual[codepoint])
else
individual[codepoint] = scriptCode
end
end
end
end
end
for position, ranges in pairs(output) do
if type(position) == "number" then
local prevRange
local i = 1
while ranges[i] do
range = ranges[i]
if prevRange and range[3] == prevRange[3] and prevRange[2] == range[1] - 1 then
mw.logf("Merged %s with %s",
printScriptRange(range), printScriptRange(prevRange))
prevRange[2] = range[2]
table.remove(ranges, i)
i = i - 1 -- to compensate for removed element
end
prevRange = range
i = i + 1
end
table.sort(ranges, sortRange)
end
end
local individualCodepoints = require "Module:table".numKeys(individual)
local minimumCodepointRange = 3
local i = 1
while individualCodepoints[i] do
local codepoint = individualCodepoints[i]
local script = individual[codepoint]
if not script then
error(("No script for U+%04X"):format(codepoint))
end
local startOfRun = codepoint
while individual[codepoint + 1] == script do
codepoint = codepoint + 1
i = i + 1
end
if codepoint - startOfRun + 1 >= minimumCodepointRange
and makeRangeKey(startOfRun) == makeRangeKey(codepoint) then
for j = startOfRun, codepoint do
individual[j] = nil
end
local rangeKey = makeRangeKey(startOfRun)
local ranges = output[rangeKey]
if not ranges then
ranges = {}
output[rangeKey] = ranges
end
ranges:insert({ startOfRun, codepoint, script })
mw.logf("Added range %s from a run in individual map",
printScriptRange { startOfRun, codepoint, script })
table.sort(ranges, sortRange)
end
i = i + 1
end
for position, ranges in pairs(output) do
if type(position) == "number" then
if ranges[2] then
fixRangeOverlaps(ranges)
end
end
end
-- Add length field to range arrays and check that there are no overlaps
-- between ranges.
output.blocks = {}
local prevScript, blockRange
for index, ranges in pairs(output) do
if type(index) == "number" then
ranges.length = #ranges
if ranges[2] then
checkRangeOverlaps(ranges)
end
local firstScript = ranges[1][3]
if not ranges[2] or require "Module:fun".all(
function (range)
return range[3] == firstScript
end,
ranges) then -- All ranges contain the same script.
if prevScript and firstScript == prevScript then
if not blockRange then
blockRange = { index - 1, index, prevScript }
table.insert(output.blocks, blockRange)
else
blockRange[2] = index
end
else
blockRange = nil
prevScript = firstScript
end
else
prevScript = nil
end
end
end
setmetatable(output, nil)
return output
end
--[[
Binary search: more efficient for the longer lists of codepoint ranges than
for the shorter ones.
]]
local function binarySearch(ranges, value)
-- Initialize numbers.
local iStart, iMid = 1, 0
-- Can't use # because table is loaded by mw.loadData.
local iEnd = require("Module:table").size(ranges)
if iEnd == 0 then
return nil
end
local iterations = 0
-- Do search.
while iStart <= iEnd do
iterations = iterations + 1
-- Calculate middle.
iMid = floor((iStart + iEnd) / 2)
-- Get compare value.
local range = ranges[iMid]
-- Return matching index. Assumes there are no duplicates.
if isInRange(value, range[1], range[2]) then
return range
-- Keep searching.
elseif value < range[1] then
iEnd = iMid - 1
else
iStart = iMid + 1
end
end
return nil
end
local function lookupInOrder(number, ranges)
for i, range in ipairs(ranges) do
if isInRange(number, range[1], range[2]) then
-- mw.log(mw.ustring.char(number), hex(number), i)
return range[3]
end
if number < range[1] then
-- mw.log(mw.ustring.char(number), hex(number), i)
return nil
end
end
end
-- Save previously used codepoint ranges in case another character is in the
-- same range.
local rangesCache = {}
function export.charToScript(char)
local lookup = mw.loadData("Module:User:Erutuon/script recognition/data") -- makeCodepointToScriptLookup()
local codepoint = mw.ustring.codepoint(char)
local individualMatch = lookup.individual[codepoint]
if individualMatch then
return individualMatch
else
local script = lookupInOrder(codepoint, rangesCache)
if script then
return script
end
local index = makeRangeKey(codepoint)
script = lookupInOrder(index, lookup.blocks)
if script then
return script
end
local range = binarySearch(lookup[index], codepoint)
if range then
table.insert(rangesCache, range)
table.sort(rangesCache, sortRange)
return range[3]
end
end
return nil
end
function export.show(frame)
local allScriptsCharacterLookup = mw.loadData("Module:User:Erutuon/script recognition/data")
local str = frame.args[1] or "ABCD一丨丶丿乙亅"
local result = {}
forEachChar(
str,
function(char)
table.insert(result, tostring(export.charToScript(char)))
end
)
return str .. ": " .. table.concat(result, ", ")
end
function export.show(frame)
return dump(makeCodepointToScriptLookup())
end
return export