Module:User:Erutuon/script recognition

From The Languages of David J. Peterson
Jump to navigation Jump to search

This module generated the codepoint-to-script lookup table in Module:Unicode data/scripts.

Lua error at line 421: attempt to index local 'overrider' (a nil value).


local export = {}

local getCodepoint = mw.ustring.codepoint
local U = mw.ustring.char
local floor = math.floor

local title = mw.title.getCurrentTitle().fullText

local function check(funcName, expectType)
	return function(argIndex, arg)
		require("libraryUtil").checkType(funcName, argIndex, arg, expectType)
	end
end

function mw.logf(...)
	return mw.log(string.format(...))
end

local output_mt = {}
function output_mt:insert(str)
	self.n = self.n + 1
	self[self.n] = str
end

function output_mt:insert_format(...)
	self:insert(string.format(...))
end

output_mt.join = table.concat

output_mt.__index = output_mt

local function Output()
	return setmetatable({ n = 0 }, output_mt)
end

local function dump(val)
	local output = Output()
	
	output:insert('{\n')
	local range_format =
[[
		{ 0x%05X, 0x%05X, "%s" },
]]
	local length_format = -- also close range array
[[
		length = %d,
	},
]]
	for i = 0, 0x10FFFF / 0x100 do
		local ranges = val[i]
		if ranges then
			output:insert_format(
[[
	[0x%02X] = {
]], i)
			for j, range in ipairs(ranges) do
				output:insert_format(range_format, unpack(range))
			end
			output:insert_format(length_format, ranges.length or -1)
		end
	end
	
	output:insert
[[

	individual = {
]]
	
	for codepoint, script in require "Module:table".sortedPairs(val.individual) do
		output:insert_format(
[[
		[0x%05X] = "%s",
]],		codepoint, script)
	end
	output:insert [[
	},

	blocks = {
]]
	
	for _, blockRange in ipairs(val.blocks) do
		output:insert_format(
[[
		{ 0x%02X, 0x%02X, "%s" },
]],		unpack(blockRange))
	end
	
	output:insert
[[
	},
}]]
	
	return require "Module:debug".highlight(table.concat(output))
end

local function printRanges(ranges)
	local output = Output()
	output:insert("Ranges:")
	for _, range in ipairs(ranges) do
		output:insert_format('\n\tU+%04X-U+%04X: %s', unpack(range))
	end
	mw.log(output:join())
end

local function hasContents(t)
	if next(t) then
		return true
	else
		return false
	end
end

local function log(message)
	if title:match("testcases/documentation$") then
		mw.log(message)
	end
end

local function makeRangeKey(codepoint)
	return floor(codepoint / 0x1000)
end

local function isInRange(value, lower, upper)
	-- mw.log(value, lower, upper)
	local check = check("isInRange", "number")
	check(1, value)
	check(2, lower)
	check(3, upper)
	
	return value >= lower and value <= upper
end

local function lookupCharacter(characterLookup, character)
	local codepoint
	if type(character) == "string" then
		if mw.ustring.len(character) == 1 then
			codepoint = getCodepoint(character)
		else
			error("Character " .. character .. " has length " .. mw.ustring.len(character) .. ". It is supposed to be a single character.")
		end
	elseif type(character) == "number" then
		codepoint = character
	else
		error("Character is the wrong type: " .. type(character) .. ".")
	end
	
	if characterLookup.smallest and not isInRange(codepoint, characterLookup.smallest, characterLookup.largest) then
		return false
	elseif characterLookup.values and characterLookup.values[codepoint] then
		return true
	else
		for i, range in ipairs(characterLookup) do
			if isInRange(codepoint, range[1], range[2]) then
				return true
			end
		end
	end
	
	return false
end

local function forEachChar(str, func)
	if type(func) == "function" then
		for i = 1, mw.ustring.len(str) do
			char = mw.ustring.sub(str, i, i)
			func(char)
		end
	end
end

function export.makeCharacterLookup(pattern)
	local characterLookup = {}
	local values = {}
	local allValues = {}
	
	local i = 1
	-- Create ranges in which all characters belong to the script.
	local workingString = mw.ustring.gsub(
		pattern,
		"([^-])%-([^-])",
		function(item1, item2)
			local codepoint1, codepoint2 = getCodepoint(item1), getCodepoint(item2)
			--[[
			if not (codepoint1 < codepoint2) then
				error("Wrong codepoint order with " .. U(codepoint1) .. " and " .. U(codepoint2) .. "!")
			end
			]]
			table.insert(characterLookup, { codepoint1, codepoint2 })
			allValues[codepoint1] = true
			allValues[codepoint2] = true
			return ""
		end
	)
	if workingString ~= "" then
		workingString = mw.ustring.gsub(
			workingString,
			".",
			function(char)
				local codepoint = getCodepoint(char)
				values[codepoint] = true
				allValues[codepoint] = true
			end
		)
	end
	
	--[[
		Place the tables of ranges in the Unicode order (the patterns
		should already be in that order, but just to be safe).
	]]
	table.sort(
		characterLookup,
		function(item1, item2)
			return item1[1] < item2[1]
		end)
	
	local allValuesKeys = require("Module:table").numKeys(allValues)
	
	local smallest, largest = allValuesKeys[1], allValuesKeys[#allValuesKeys]
	
	-- Don't create an empty values table.
	if hasContents(values) then
		characterLookup.values = values
	end
	
	--[[
		Don't record the smallest and largest values if they're found in the
		first range.
	]]
	if not (smallest == characterLookup[1][1] and largest == characterLookup[1][2]) then
		characterLookup.smallest, characterLookup.largest = smallest, largest
	end
	
	return characterLookup
end

function export.makeAllScriptsCharacterLookup()
	local allScriptsCharacterLookup = {}
	local patternToScript = {}
	for code, data in pairs(require("Module:scripts/data")) do
		if data.characters then
			-- Don't generate identical lookup table twice.
			local scriptWithPattern = patternToScript[data.characters]
			if scriptWithPattern then
				allScriptsCharacterLookup[code] = allScriptsCharacterLookup[scriptWithPattern]
			else
				allScriptsCharacterLookup[code] = export.makeCharacterLookup(data.characters)
			end
			patternToScript[data.characters] = code
		end
	end
	return allScriptsCharacterLookup
end

-- fa-Arab → Arab-fa
local function switchLangSc(scriptCode)
	return scriptCode:gsub("^([^-]+)%-(.+)$", "%2-%1")
end

-- To ensure that Grek and Latn appear first.
-- This also makes Grek and Latn take precedence when generating
-- the codepoint-to-script lookup table.
local scriptCodeReplacements = {
	polytonic = "Grek2",
	Latinx = "Latnx",
	Latf = "Latnf",
}

local function modifyAdHocCode(code)
	if scriptCodeReplacements[code] then
		return scriptCodeReplacements[code]
	elseif not (code:find("%u%l%l%l") or code:find("%l%l%l%-%u%l%l%l")) then
		return code:gsub("^(.+)$", "~%1")
	else
		return code
	end
end
	
local function keySort(key1, key2)
	local type1, type2 = type(key1), type(key2)
	if type1 == "number" and type2 == "string" then
		return true
	elseif type1 == "string" and type2 == "number" then
		return false
	elseif type1 == "string" then
		key1, key2 = modifyAdHocCode(key1), modifyAdHocCode(key2)
		key1, key2 = switchLangSc(key1), switchLangSc(key2)
		local lower1, lower2 = mw.ustring.lower(key1), mw.ustring.lower(key2)
		return lower1 < lower2
	else
		return key1 < key2
	end
end

local function hex(number)
	return string.format("0x%X", number)
end

local function divideRange(lower, upper, width, testing)
	local ranges = {}
	
	if not (lower and upper) then
		mw.log("divideRange failed:", lower, upper, width, testing)
		return nil
	end
	
	local position = floor(lower / width)
	local start = position * width
	
	local i = 0
	local increment = i * width
	repeat
		local range1 = start + increment
		local range2 = range1 + width - 1
		
		if range1 < lower then
			range1 = lower
		end
		
		if range2 > upper then
			range2 = upper
		end
		
		if testing then
			range1, range2 = hex(range1), hex(range2)
		end
		
		ranges[position + i] = { range1, range2 }
		
		i = i + 1
		increment = i * width
	until
		 start + increment > upper
	
	return ranges
end

function export.showDividedRange(frame)
	local lower = 0x2A700
	local higher = 0x2B73F
	local width = 0x1000
	local dividedRange = divideRange(lower, higher, width, true)
	return table.concat({ hex(lower), hex(higher) }, ", ") .. dump(dividedRange)
end

-- Scripts that consist entirely of characters from another script.
local scriptBlacklist = {
	["Latf"]		= true;
	["Hans"]		= true;
	["Hant"]		= true;
	["Kore"]		= true;
	["Jpan"]		= true;
	["fa-Arab"] 	= true;
	["kk-Arab"] 	= true;
	["ks-Arab"] 	= true;
	["ku-Arab"]		= true;
	["ms-Arab"]		= true;
	["mzn-Arab"]	= true;
	["ota-Arab"]	= true;
	["pa-Arab"]		= true;
	["ps-Arab"]		= true;
	["sd-Arab"]		= true;
	["tt-Arab"]		= true;
	["ug-Arab"]		= true;
	["ur-Arab"]		= true;
	["nv-Latn"]		= true;
	["pjt-Latn"]	= true;
	["Zyyy"]		= true;
}

local function sortRange(range1, range2)
	local number1, number2 = tonumber(range1[1]), tonumber(range2[1])
	if number1 == number2 then
		return keySort(range1[3], range2[3])
	else
		return number1 < number2
	end
end

local function printScriptRange(range, hideScriptName)
	if hideScriptName then
		return ("U+%04X-U+%04X"):format(range[1], range[2])
	else
		return ("%s (U+%04X-U+%04X)"):format(range[3], range[1], range[2])
	end
end
	
-- When there is overlap between ranges belonging to two different scripts,
-- the key in this table overrides the value.
local overrides = {
	Beng = "as-Beng",
	Cyrl = "Cyrs",
	Grek = "polytonic",
	Latn = "Latinx",
}

local function fixRangeOverlaps(ranges)
	local prev
	local i = 1
	while ranges[i] do
		range = ranges[i]
		prev = ranges[i - 1]
		if prev and (range[1] <= prev[1] or range[2] <= prev[2]) then
			-- mw.logf("%s in conflict with %s",
				-- printScriptRange(prev), printScriptRange(range))
			local overrider, overridden
			if overrides[range[3]] == prev[3] then
				overrider, overridden = range, prev
			elseif overrides[prev[3]] == range[3] then
				overrider, overridden = prev, range
			end
			
			if overrider and overridden then
				mw.logf("%s overrides %s", printScriptRange(overrider),
					printScriptRange(overridden))
			else
				mw.logf("Should %s override %s or the other way around?",
					printScriptRange(prev), printScriptRange(range))
			end
			
			if overrider[1] <= overridden[1] then -- low end of overridden is inside overrider
				if overridden[2] <= overrider[2]  then -- overridden entirely within overrider
					table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
					if overridden == prev then
						i = i - 1
					end
				else -- upper part of overridden outside of overrider
					if overridden[2] - overrider[2] == 1 then -- one codepoint of overridden is outside overrider
						table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
						if overridden == prev then
							i = i - 1
						end
						individual[overridden[2]] = overridden[3]
					else
						overridden[1] = overrider[2] + 1
					end
				end
			else -- overridden[1] < overrider[1]: low end of overridden is outside overrider
				-- single codepoint at low end of overridden is outside overrider
				table.remove(ranges, overridden == range and i or i - 1) -- remove overridden
				if overridden == prev then
					i = i - 1
				end
				
				if overrider[1] - overridden[1] == 1 then
					individual[overridden[1]] = overridden[3]
				else -- multiple codepoints at low end of overridden are outside overrider
					ranges:insert(i,
						{ overridden[1], overrider[1] - 1, overridden[3] })
					i = i + 1
				end
				
				if overrider[2] < overridden[2] then -- high end of overridden is outside overrider
					-- single codepoint at high end of overridden is outside of overrider
					if overridden[2] - overrider[2] == 1 then
						individual[overridden[2]] = overridden[3]
					else
						ranges:insert(i,
							{ overrider[2] + 1, overridden[2], overridden[3] })
						i = i + 1
					end
				end
			end
		end
		i = i + 1
	end
end

local function checkRangeOverlaps(ranges)
	local prev
	for i, range in ipairs(ranges) do
		if prev and prev[2] >= range[1] then
			mw.logf("%s overlaps with %s",
				printScriptRange(prev), printScriptRange(range))
		end
		prev = range
	end
end

local function makeCodepointToScriptLookup(testing)
	local output = {}
	local ranges_mt = {
		insert = function (self, i, value)
			if value ~= nil then
				if self[i][1] < value[1] then
					i = i + 1
				end
				mw.logf("Inserting %s below %s",
					printScriptRange(value), printScriptRange(self[i]))
				table.insert(self, i, value)
			else
				value = i
				table.insert(self, value)
			end
		end,
		remove = table.remove,
	}
	ranges_mt.__index = ranges_mt
	
	setmetatable(output,
		{
			__index = function (self, key)
				local val = setmetatable({}, ranges_mt)
				self[key] = val
				return val
			end,
		})
	output.individual = {}
	local individual = output.individual
	local rangeStrings = {}
	
	local allScriptsCharacterLookup = export.makeAllScriptsCharacterLookup()
	for scriptCode, lookup in require("Module:table").sortedPairs(allScriptsCharacterLookup, keySort) do
		if not scriptBlacklist[scriptCode] then
			for key, value in ipairs(lookup) do
				if type(value) == "table" then
					local newRanges = divideRange(value[1], value[2], 0x1000, testing)
					if newRanges then
						for position, newRange in pairs(newRanges) do
							local rangeString = newRange[1] .. newRange[2]
							if rangeStrings[rangeString] then
								mw.logf("The range U+%04X-U+%04X is already "
									.. "recorded as belonging to the script "
									.. "code %s.",
									newRange[1], newRange[2], rangeStrings[rangeString])
							else
								rangeStrings[rangeString] = scriptCode
								
								output[position]:insert({ newRange[1], newRange[2], scriptCode })
							end
						end
					end
				end
			end
			
			if lookup.values then
				for codepoint in pairs(lookup.values) do
					if individual[codepoint] then
						mw.logf("The codepoint %s is already recorded as " ..
							"belonging to the script code %s.",
							hex(codepoint), individual[codepoint])
					else
						individual[codepoint] = scriptCode
					end
				end
			end
		end
	end
	
	for position, ranges in pairs(output) do
		if type(position) == "number" then
			local prevRange
			local i = 1
			while ranges[i] do
				range = ranges[i]
				if prevRange and range[3] == prevRange[3] and prevRange[2] == range[1] - 1 then
					mw.logf("Merged %s with %s",
						printScriptRange(range), printScriptRange(prevRange))
					prevRange[2] = range[2]
					table.remove(ranges, i)
					i = i - 1 -- to compensate for removed element
				end
				prevRange = range
				i = i + 1
			end
			table.sort(ranges, sortRange)
		end
	end
	
	local individualCodepoints = require "Module:table".numKeys(individual)
	local minimumCodepointRange = 3
	local i = 1
	while individualCodepoints[i] do
		local codepoint = individualCodepoints[i]
		local script = individual[codepoint]
		if not script then
			error(("No script for U+%04X"):format(codepoint))
		end
		
		local startOfRun = codepoint
		while individual[codepoint + 1] == script do
			codepoint = codepoint + 1
			i = i + 1
		end
		
		if codepoint - startOfRun + 1 >= minimumCodepointRange
			and makeRangeKey(startOfRun) == makeRangeKey(codepoint) then
			for j = startOfRun, codepoint do
				individual[j] = nil
			end
			
			local rangeKey = makeRangeKey(startOfRun)
			local ranges = output[rangeKey]
			if not ranges then
				ranges = {}
				output[rangeKey] = ranges
			end
			ranges:insert({ startOfRun, codepoint, script })
			mw.logf("Added range %s from a run in individual map",
				printScriptRange { startOfRun, codepoint, script })
			table.sort(ranges, sortRange)
		end
		
		i = i + 1
	end
	
	for position, ranges in pairs(output) do
		if type(position) == "number" then
			if ranges[2] then
				fixRangeOverlaps(ranges)
			end
		end
	end
	
	-- Add length field to range arrays and check that there are no overlaps
	-- between ranges.
	output.blocks = {}
	local prevScript, blockRange
	for index, ranges in pairs(output) do
		if type(index) == "number" then
			ranges.length = #ranges
			if ranges[2] then
				checkRangeOverlaps(ranges)
			end
			local firstScript = ranges[1][3]
			if not ranges[2] or require "Module:fun".all(
					function (range)
						return range[3] == firstScript
					end,
					ranges) then -- All ranges contain the same script.
				if prevScript and firstScript == prevScript then
					if not blockRange then
						blockRange = { index - 1, index, prevScript }
						table.insert(output.blocks, blockRange)
					else
						blockRange[2] = index
					end
				else
					blockRange = nil
					prevScript = firstScript
				end
			else
				prevScript = nil
			end
		end
	end
	
	setmetatable(output, nil)
	
	return output
end

--[[
	Binary search: more efficient for the longer lists of codepoint ranges than
	for the shorter ones.
]]
local function binarySearch(ranges, value)
	--	Initialize numbers.
	local iStart, iMid = 1, 0
	-- Can't use # because table is loaded by mw.loadData.
	local iEnd = require("Module:table").size(ranges)
	
	if iEnd == 0 then
		return nil
	end
	
	local iterations = 0
	
	-- Do search.
	while iStart <= iEnd do
		iterations = iterations + 1
		
		-- Calculate middle.
		iMid = floor((iStart + iEnd) / 2)
		
		-- Get compare value.
		local range = ranges[iMid]
		
		-- Return matching index. Assumes there are no duplicates.
		if isInRange(value, range[1], range[2]) then
			return range
		
		-- Keep searching.
		elseif value < range[1] then
			iEnd = iMid - 1
		else
			iStart = iMid + 1
		end
	end
	return nil
end

local function lookupInOrder(number, ranges)
	for i, range in ipairs(ranges) do
		if isInRange(number, range[1], range[2]) then
			-- mw.log(mw.ustring.char(number), hex(number), i)
			return range[3]
		end
		if number < range[1] then
			-- mw.log(mw.ustring.char(number), hex(number), i)
			return nil
		end
	end
end

-- Save previously used codepoint ranges in case another character is in the
-- same range.
local rangesCache = {}

function export.charToScript(char)
	local lookup = mw.loadData("Module:User:Erutuon/script recognition/data") -- makeCodepointToScriptLookup()
	local codepoint = mw.ustring.codepoint(char)
	
	local individualMatch = lookup.individual[codepoint]
	if individualMatch then
		return individualMatch
	else
		local script = lookupInOrder(codepoint, rangesCache)
		if script then
			return script
		end
		
		local index = makeRangeKey(codepoint)
		
		script = lookupInOrder(index, lookup.blocks)
		if script then
			return script
		end
		
		local range = binarySearch(lookup[index], codepoint)
		if range then
			table.insert(rangesCache, range)
			table.sort(rangesCache, sortRange)
			return range[3]
		end
	end
	
	return nil
end

function export.show(frame)
	local allScriptsCharacterLookup = mw.loadData("Module:User:Erutuon/script recognition/data")
	
	local str = frame.args[1] or "ABCD一丨丶丿乙亅"
	
	local result = {}
	forEachChar(
		str,
		function(char)
			table.insert(result, tostring(export.charToScript(char)))
		end
	)

	return str .. ": " .. table.concat(result, ", ")
end

function export.show(frame)
	return dump(makeCodepointToScriptLookup())
end

return export