Jump to content

Module:Sandbox/AbstractWikipedia/Lexemes

From Meta, a Wikimedia project coordination wiki
Module documentation

This is the lexemes module of the Abstract Wikipedia template-renderer prototype.

It defines the internal lexeme datatype and various methods to manipulate it. This type is based on the lexeme type discussed in the Abstract Wikipedia template proposal.

The handling of the lexemes' features is done through the UnifiableFeatures module.


local p = {}

local uf = require("Module:Sandbox/AbstractWikipedia/UnifiableFeatures")
local gf = require("Module:Sandbox/AbstractWikipedia/GrammaticalFeatures")


-- Creates a list of features from a given table of category-feature pairs.
--
-- The list maps each category to the index that uf.createNewFeature returned
-- for its feature; uf.getFeature is used to turn such an index back into the
-- feature.
--
-- new_features: table of category-feature pairs. It is iterated with pairs(),
--               so it must be a table (pass {} for an empty list).
--
-- Returns a table of functions operating on the (private) category-to-index
-- table. The functions do not take a self argument: call them with a dot,
-- e.g. list.addFeature(category, feature).
local function featureList ( new_features )
	local features = {}
	local features_api = {}

	-- Verify that category hasn't been used already.
	-- Level 3 attributes the error to the code which called the API function
	-- that invoked this check, rather than to this helper.
	local function verifyNewCategory ( category )
		if features[category] then
			error ("Category "..category.." already exists", 3)
		end
	end

	-- Adds a new feature of a certain category, returns the index.
	-- Raises an error if the category is already present.
	function features_api.addFeature ( category, feature )
		verifyNewCategory(category)
		features[category] = uf.createNewFeature(feature)
		return features[category]
	end

	-- Returns whatever uf.getFeature yields for the index stored under
	-- category. For an unknown category the stored index is nil and is passed
	-- to uf.getFeature as is.
	function features_api.getFeature(category)
		return uf.getFeature(features[category])
	end

	-- Returns the index stored under category, or nil if there is none.
	function features_api.getFeatureIndex(category)
		return features[category]
	end

	-- Returns a pairs() iterator over (category, index).
	function features_api.featureIterator()
		return pairs(features)
	end

	-- Returns the number of categories in the list.
	function features_api.numFeatures()
		-- This could probably be stored in the table itself
		local count = 0
		for _, _ in pairs(features) do
			count = count + 1
		end
		return count
	end

	-- Sets a feature of a new category to an existing value (indexed by index).
	-- Raises an error if the category is already present or if uf.getFeature
	-- returns a false value for index.
	function features_api.setFeatureIndex ( category, index )
		verifyNewCategory(category)
		if (not uf.getFeature(index)) then
			-- tostring() keeps the intended message intact even when index is
			-- nil or another value that cannot be concatenated directly.
			error ("Index "..tostring(index).." points to inexistent feature.")
		end
		features[category] = index
	end

	-- for debugging purposes: logs one "category == feature" line per entry
	function features_api.listFeatures ()
		for category, index in pairs(features) do
			mw.log(category.." == "..tostring(uf.getFeature(index)))
		end
	end

	for category, feature in pairs(new_features) do
		features_api.addFeature(category, feature)
	end

	return features_api
end

-- Builds a single form: a spelling together with its own feature list.
-- The feature-list functions are reachable directly on the form (through the
-- metatable's __index), and converting the form to a string gives its
-- spelling.
local function newForm ( spelling, new_features )
	local form = {}
	form.spelling = spelling
	form.features = featureList(new_features)

	setmetatable(form, {
		__index = form.features,
		__tostring = function (self)
			return self.spelling
		end,
	})

	-- for debugging purposes: logs the spelling (optionally labelled with
	-- the given index) followed by the form's features
	form.log = function ( index )
		local label = index or ""
		mw.log("Form "..label..": '"..form.spelling.."'")
		form.features.listFeatures()
	end

	return form
end

-- Creates a new lexeme.
--
-- lemma:          the lemma; used as the string form of the lexeme while it
--                 has no forms
-- part_of_speech: stored in the `pos` field
-- new_features:   optional table of category-feature pairs for the
--                 lexeme-level features (defaults to an empty table)
--
-- Returns a table with the fields `lemma`, `pos`, `features` and `forms`
-- (a sequence of forms) plus the functions defined below. The functions do
-- not take a self argument: call them with a dot, e.g. lexeme.addForm(...).
function p.newLexeme ( lemma, part_of_speech, new_features )
	local lexeme = { lemma = lemma, pos = part_of_speech, features = featureList (new_features or {}), forms = {} }

	-- String conversion: the spelling of the first form if there is one,
	-- otherwise the lemma.
	-- Deliberately not named "tostring": a local of that name would shadow
	-- the global tostring for the rest of this constructor.
	local function lexemeToString(self)
		if (#self.forms > 0) then
			return self.forms[1].spelling
		else
			return self.lemma
		end
	end

	-- The feature functions are accessible at the lexeme level for convenience
	setmetatable(lexeme, { __index = lexeme.features, __tostring = lexemeToString })

	-- Adds a new form and returns it.
	-- form_features is an optional table of category-feature pairs.
	function lexeme.addForm ( spelling, form_features )
		form_features = form_features or {}
		local form = newForm (spelling, form_features)
		table.insert(lexeme.forms, form)
		return form
	end

	-- Clears all forms and optionally creates a new single form.
	-- This is handy when we want to overwrite the existing forms
	function lexeme.replaceByForm ( new_single_form )
		lexeme.forms = {}
		if new_single_form then
			lexeme.addForm(new_single_form)
		end
	end

	-- Sorts the forms according to gf.cannonical_order.
	-- Each entry of gf.cannonical_order is read as a table holding a
	-- `category` field and a rank for each feature of that category.
	function lexeme.sortForms()
		local function compare_forms (form1, form2)
			for _, category_order in ipairs(gf.cannonical_order) do
				local category = category_order.category
				local rank1 = category_order[form1.getFeature(category)]
				local rank2 = category_order[form2.getFeature(category)]
				if rank1 ~= rank2 then
					-- A form without a rank for this category sorts after a
					-- form which has one.
					if not rank2 then
						return true
					elseif not rank1 then
						return false
					else
						return rank1 < rank2
					end
				end
			end
			-- If all canonical features are equal, the form with fewer
			-- features overall is considered smaller
			return (form1.numFeatures() < form2.numFeatures())
		end

		table.sort(lexeme.forms, compare_forms)
	end

	-- This function removes the forms which don't match the general lexeme
	-- features/constraints.
	-- Returns the number of forms which are kept
	function lexeme.filterForms()
		-- Both tables/indices below are locals: assigning them without
		-- `local` would create global variables as a side effect.
		local new_forms = {}
		-- Iterate on forms
		for _, form in ipairs(lexeme.forms) do
			local keep_form = true
			-- Iterate on lexeme constraints
			for category, index in lexeme.featureIterator() do
				-- Note that if the form lacks the category, it can be kept
				local form_feature_index = form.getFeatureIndex(category)
				-- In a more strict mode, we should require the form features
				-- to strictly subsume the constraints.
				if (form_feature_index and not uf.unifiable(form_feature_index, index)) then
					-- tostring() so that a feature which is not a plain
					-- string cannot abort the filtering from a log call
					mw.log("Discard form "..form.spelling.." due to mismatch with feature '"..tostring(uf.getFeature(index)).."' of category "..category)
					keep_form = false
					break
				end
			end
			if keep_form then
				mw.log("Keeping form "..form.spelling)
				table.insert(new_forms, form)
			end
		end
		lexeme.forms = new_forms
		return #new_forms
	end

	-- for debugging purposes: logs the lemma and part of speech, the
	-- lexeme-level features, and then every form in order
	function lexeme.log ()
		-- Read the lemma field (as the string conversion does), so that the
		-- log reflects the current state of the lexeme.
		mw.log("Lemma: "..lexeme.lemma.." ("..lexeme.pos..")")
		lexeme.features.listFeatures()
		-- forms is a sequence; ipairs guarantees the forms are logged in order
		for index, form in ipairs(lexeme.forms) do
			form.log(index)
		end
	end

	return lexeme
end

-- Unifies the feature of category1 in lexeme1 with the feature of category2
-- in lexeme2. When category2 is omitted, category1 is used for both lexemes.
--
-- Returns the result of the unification: the value of uf.unify when both
-- features exist, the existing feature when only one side has one, and ''
-- when neither side has one. Raises an error (attributed to the caller) when
-- uf.unify returns nil.
function p.unifyFeatures ( category1, lexeme1, lexeme2, category2 )
	if not category2 then
		category2 = category1
	end

	local first = lexeme1.getFeatureIndex(category1)
	local second = lexeme2.getFeatureIndex(category2)

	-- Both sides carry a feature: unify the two
	if first and second then
		local unified = uf.unify(first, second)
		if unified == nil then
			error ("Features "..uf.getFeature(first).." and "..uf.getFeature(second).." are not unifiable", 2)
		end
		return unified
	end

	-- Only lexeme1 carries a feature: share its index with lexeme2
	if first then
		lexeme2.setFeatureIndex(category2, first)
		return uf.getFeature(first)
	end

	-- Only lexeme2 carries a feature: share its index with lexeme1
	if second then
		lexeme1.setFeatureIndex(category1, second)
		return uf.getFeature(second)
	end

	-- Neither side carries a feature: create a new empty one in lexeme1 and
	-- let lexeme2 point to the same index
	first = lexeme1.addFeature(category1, '')
	lexeme2.setFeatureIndex(category2, first)
	return ''
end

-- Unifies the feature that a lexeme holds for a category with a given
-- feature. If the lexeme has no feature for that category yet, the given
-- feature is added and returned. Otherwise the value of uf.unifyWithFeature
-- is returned, and an error (attributed to the caller) is raised when that
-- value is nil.
function p.unifyWithFeature ( category, lexeme, feature )
	local existing = lexeme.getFeatureIndex(category)

	if existing then
		local unified = uf.unifyWithFeature(existing, feature)
		if unified == nil then
			error ("Features "..uf.getFeature(existing).." and "..feature.." are not unifiable", 2)
		end
		return unified
	end

	-- Nothing to unify with: create the feature
	lexeme.addFeature(category, feature)
	return feature
end
		
return p