-- Module:Hani
--
-- From Wiktionary, the free dictionary
-- Jump to navigation Jump to search

local concat = table.concat
local explode = require("Module:string utilities").explode_utf8
local find = string.find
local insert = table.insert
local match = string.match
local pcall = pcall
local umatch = mw.ustring.match

local export = {}

do
	local ids = mw.loadData("Module:Hani/data").ids
	
	-- Walks forward through `text` (an array of characters) to find where the IDS sequence that opens at index `i` ends. `components` is the number of operands still to be consumed after `text[i]`; whenever an operand has its own entry in `ids`, it is treated as opening a nested sequence, which is resolved recursively before counting continues. Returns the index of the final character of the sequence. If the text runs out or whitespace is reached first, throws a table with `_end` (the last index reached before the failure) and `_expected` (the number of characters still missing, accumulated across the nested levels).
	local function find_end_of_ids(text, i, components)
		local pos, seen = i, 0
		while true do
			seen = seen + 1
			pos = pos + 1
			local current = text[pos]
			-- Running off the end of the text or hitting whitespace invalidates the whole sequence.
			if not current or umatch(current, "%s") then
				-- `components - seen + 1` counts the operand that was just found missing plus those after it.
				error{_end = pos - 1, _expected = components - seen + 1}
			end
			local nested = ids[current]
			if nested then
				-- This operand opens a sequence of its own; run it under pcall so a failure can be amended on the way back up.
				local ok, result = pcall(find_end_of_ids, text, pos, nested)
				if not ok then
					-- Only the error tables raised above carry `_expected`; top them up with the operands this level still needed, then rethrow whatever was caught.
					if result._expected then
						result._expected = result._expected + components - seen
					end
					error(result)
				end
				-- On success the result is the index of the nested sequence's last character.
				pos = result
			end
			if seen == components then
				return pos
			end
		end
	end
	
	-- Splits `text` into an array in which each ideographic description sequence (IDS) occupies a single element and every other character gets an element of its own. Invalid IDS raises an error unless `fallback` is set, in which case the problem is logged with mw.log and reported to Module:debug/track, and the offending sequence is broken into its largest possible components (e.g. "⿲⿸AB⿱CD" gives "⿲", "⿸AB" and "⿱CD", while "⿰⿱AB⿰C" gives "⿰", "⿱AB", "⿰" and "C"). That mode is useful for sortkey contexts, where invalid sequences may occur in arbitrary input.
	function export.explode_chars(text, fallback)
		-- Text containing none of U+2FF0-U+2FFF, 〾 or ㇯ needs no IDS handling, so it can be exploded directly.
		if not match(text, "\226\191[\176-\191]") and not find(text, "〾") and not find(text, "㇯") then
			return explode(text)
		end
		local chars = explode(text)
		local last, result, pos = #chars, {}, 0
		while true do
			pos = pos + 1
			local piece = chars[pos]
			local operands = ids[piece]
			if operands then
				local ok, outcome = pcall(find_end_of_ids, chars, pos, operands)
				if ok then
					-- Valid sequence: merge everything up to its final index into one element and resume after it.
					piece = concat(chars, nil, pos, outcome)
					pos = outcome
				else
					-- Any other errors (e.g. stack overflows) will be strings, which have no `_expected`; rethrow them.
					if not outcome._expected then
						error(outcome)
					end
					local missing = outcome._expected
					local message = "Invalid IDS sequence: \"" .. concat(chars, nil, pos, outcome._end) ..
						"\": expected " .. missing .. " additional character" ..
						(missing == 1 and "" or "s") .. "."
					if not fallback then
						error(message)
					end
					-- Fallback mode: record the problem, then emit the leading character by itself and carry on from the next one.
					mw.log(message)
					require("Module:debug/track")("Hani/invalid ids")
				end
			end
			insert(result, piece)
			if pos == last then
				break
			end
		end
		return result
	end
end

-- Replaces iteration marks (々 and 〻) in `text` with the characters they stand for, where a run of n iteration marks repeats the n characters before it (e.g. "時々" = "時時", "馬鹿々々しい" = "馬鹿馬鹿しい"). Characters matching %W (such as punctuation) and unconnected sets of iteration marks block iteration, and any excess marks are left as-is. For example, "X,Y々々" = "X,YY々", and "X々Y々々" = "XXYY々" (not "XXYXY").
function export.convert_iteration_marks(text)
	-- Nothing to do unless the text contains 々 (U+3005) or 〻 (U+303B).
	if not match(text, "\227\128[\133\187]") then
		return text
	end
	local chars = explode(text)
	-- `pending` is the number of iteration marks seen since the last resolved (or blocked) run.
	local pos, pending = #chars, 0
	-- Scan from the end, because each set of iteration marks must be resolved in isolation (e.g. "X々Y々々" should be "XXYY々", with one excess mark at the end, not "XXYXY").
	while pos > 0 do
		local current = chars[pos]
		if current == "々" or current == "〻" then
			pending = pending + 1
		elseif pending > 0 then
			-- Count how many characters ending at `pos` can be repeated, up to one per pending mark; the first character that can't be iterated marks the start of the set.
			local usable = 0
			while usable < pending do
				local candidate = chars[pos - usable]
				if not candidate or candidate == "々" or candidate == "〻" or umatch(candidate, "%W") then
					break
				end
				usable = usable + 1
			end
			pending = usable
			if pending > 0 then
				-- Overwrite the marks directly after the set with a copy of the set; any marks beyond that stay untouched.
				local first = pos - pending + 1
				for offset = 0, pending - 1 do
					chars[first + pending + offset] = chars[first + offset]
				end
				-- Continue scanning from just before the set that was repeated.
				pos = first
				pending = 0
			end
		end
		pos = pos - 1
	end
	return concat(chars)
end

return export