Module:IPA/X-SAMPA

From The Languages of David J. Peterson
Jump to navigation Jump to search

This module contains functions related to X-SAMPA. It was split from Module:IPA to save memory because X-SAMPA is not used directly in entries, but only when saving a page (as {{x2i}}, {{x2ipa}}, {{x2ipachar}} are always substituted).


local export = {}

local m_XSAMPA = mw.loadData('Module:IPA/data/X-SAMPA')

-- IPA <-> XSAMPA lookup tables
-- Reverse lookup (IPA symbol -> X-SAMPA symbol), filled lazily on first use.
local i2x_lookup = {}

--- Build the IPA -> X-SAMPA lookup table from m_XSAMPA on first call.
-- Each entry of m_XSAMPA is keyed by an X-SAMPA symbol; its first array
-- element is the IPA symbol, and the optional `with_descender` field is an
-- alternative IPA form that maps back to the same X-SAMPA symbol.
-- If several X-SAMPA symbols share an IPA symbol, whichever one `pairs`
-- visits last wins (iteration order is unspecified).
-- @return the populated `i2x_lookup` table
local function Populate_IPA_XSAMPA_LookupTables()
	-- The keys are strings, so the table never has an array part and
	-- `#i2x_lookup` is always 0; `next` is the reliable emptiness test and
	-- keeps the table from being rebuilt on every call.
	if next(i2x_lookup) == nil then
		for XSAMPA_symbol, data in pairs(m_XSAMPA) do
			local IPA_symbol = data[1]
			i2x_lookup[IPA_symbol] = XSAMPA_symbol

			local with_descender = data.with_descender
			if with_descender then
				i2x_lookup[with_descender] = XSAMPA_symbol
			end
		end
	end
	return i2x_lookup
end


--- Convert IPA text to X-SAMPA.
-- @param text either a plain string, or a frame table whose `args[1]` holds
--   the string. In the frame case the `{{=}}` and `{{!}}` placeholders are
--   turned back into `=` and `|`, the text is passed through
--   `mw.text.decode`, and the result is wrapped with `mw.text.nowiki`.
-- @return the converted string; characters with no entry in the lookup
--   table are left unchanged
function export.IPA_to_XSAMPA(text)
	local lookup = Populate_IPA_XSAMPA_LookupTables()

	local from_frame = type(text) == 'table'
	if from_frame then
		-- a frame, extract args
		local raw = text.args[1]
		raw = raw:gsub('{{=}}', '=')
		raw = raw:gsub('{{!}}', '|')
		text = mw.text.decode(raw) -- XXX
	end

	-- this basically sums up m_symbols[2].XSAMPA
	local converted = text:gsub('ːː', ':')
	-- Replace every character that has an entry in the lookup table.
	converted = mw.ustring.gsub(converted, '.', lookup)

	if not from_frame then
		return converted
	end

	local escaped_output = mw.text.nowiki(converted)
	return escaped_output
end

--- Convert X-SAMPA text to IPA.
-- @param text either a plain string, or a frame table whose `args[1]` holds
--   the string (which is then passed through `mw.text.decode`).
-- @return the IPA string. Text wrapped in `<...>` is returned wrapped in
--   `⟨...⟩`. HTML tags, character entity references and numeric character
--   references are preserved verbatim, non-ASCII bytes are copied through
--   untouched, and ASCII characters that are not X-SAMPA are left as they are.
function export.XSAMPA_to_IPA(text)
	local data = m_XSAMPA

	if type(text) == 'table' then -- a frame, extract args
		text = text.args[1]
		text = mw.text.decode(text) -- XXX
	end

	-- Simpler function adapted from [[w:Module:Sandbox/Erutuon/X-SAMPA]]
	local output, characteristics = {}, {}
	local angle_bracket
	if text:sub(1, 1) == "<" and text:sub(-1) == ">" then
		table.insert(output, "⟨")
		angle_bracket = "⟩"
		text = text:sub(2, -2)
	end

	-- placeholder character -> original text it stands in for
	local escaped = {}
	local next_codepoint = 0x1F600

	-- Replace every match of `pattern` in `str` with its own placeholder
	-- character. Each match needs a distinct placeholder; sharing one per
	-- pattern would restore all matches as whichever was recorded last.
	local function protect(str, pattern)
		return (str:gsub(
			pattern,
			function(match)
				local placeholder = mw.ustring.char(next_codepoint)
				next_codepoint = next_codepoint + 1
				escaped[placeholder] = match
				return placeholder
			end))
	end

	--[[
		Replace
		-- HTML tags
		-- character entity references (&nbsp;)
		-- numeric character references (&#32; &#x20;)
		with characters starting at the Emoticon block, so that the ASCII
		they contain is not mistaken for X-SAMPA.
	--]]
	text = protect(text, '<[^>]+>')
	text = protect(text, '&%a+;')
	text = protect(text, '&#%d+;')
	text = protect(text, '&#x%x+;')

	while #text > 0 do
		-- skip non-ASCII bytes (that is, multi-byte characters, which
		-- includes the placeholders inserted above)
		text = text:gsub(
			'^[\128-\255]+',
			function (nonASCII)
				table.insert(output, nonASCII)
				return ""
			end)

		-- Longest match first: try the leading 4, 3, 2, then 1 byte(s).
		for i = 4, 1, -1 do
			local potential_XSAMPA = text:sub(1, i)
			local result = data[potential_XSAMPA]
			local IPA, with_descender, has_descender, is_diacritic

			if result then
				IPA = result[1]
				with_descender = result.with_descender
				has_descender = result.has_descender
				is_diacritic = result.is_diacritic
				if with_descender then
					-- Go backwards through the transcription, skipping any
					-- diacritics. Stop at index 0 so that a symbol with no
					-- preceding base symbol does not index a nil entry.
					local j = #characteristics
					while j > 0 and characteristics[j].is_diacritic do
						j = j - 1
					end
					--[[	Look at the first non-diacritic symbol before the current symbol.
							If it has a descender, use the descender form of the current symbol.
							If there is no such symbol, keep the plain form. ]]
					local base = characteristics[j]
					if base and base.has_descender then
						IPA = with_descender
					end
				end
			elseif i == 1 then
				-- Not X-SAMPA: pass the single byte through unchanged.
				IPA = potential_XSAMPA
			end

			if IPA then
				text = text:sub(i + 1)
				table.insert(output, IPA)
				table.insert(characteristics, { has_descender = has_descender, is_diacritic = is_diacritic } )
				break
			end
		end
	end

	if angle_bracket then
		table.insert(output, angle_bracket)
	end

	-- Swap placeholders back for the text they replaced; multi-byte
	-- characters with no entry in `escaped` are left as they are.
	local converted = table.concat(output)
	converted = converted:gsub("[\194-\244][\128-\191]+", escaped)

	-- NOTE: the result is intentionally not passed through mw.text.nowiki;
	-- that call was already disabled (commented out) in this function.
	return converted
end

return export