-- Modulus:Lang/data
--
-- Documentation for this module may be created at Modulus:Lang/data/doc
--[[--------------------------< L A N G _ N A M E _ T A B L E >------------------------------------------------

primary table of tables that decode:
	lang -> language tags and names
	script -> ISO 15924 script tags
	region -> ISO 3166 region tags
	variant -> IANA-registered variant tags
	suppressed -> map of script tags and their associated language tags
	
all of these data come from separate modules that are derived from the IANA language-subtag-registry file

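key_to_lower() copies the data into plain tables (which avoids the metatable trap) and sets all keys in the
subtables to lowercase. Many language codes have multiple associated names; Module:Lang is only concerned with
the first name so key_to_lower() only fetches the first name, except for the variant and suppressed-script data
where the whole value is kept.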

]]

--[[--------------------------< K E Y _ T O _ L O W E R >------------------------------------------------------

Builds a new plain table from the data module named by <module>; every key in the new table is the lowercase
form of the corresponding source key.

<src_type> selects which part of the module's data is read and how much of each value is kept:
	'var_sup'	the module is fetched with require() and every value is copied whole
	'lang'		the module's 'active' subtable is read, when that subtable exists
	'lang_dep'	the module's 'deprecated' subtable is read, when that subtable exists
	otherwise	(including nil, or when the subtable above is missing) the module's top-level table is read

For every <src_type> other than 'var_sup' the module is fetched with mw.loadData() and only the first item
(index 1) of each value is kept.

returns the new lowercase-keyed table

]]

local function key_to_lower (module, src_type)
	local keep_whole = ('var_sup' == src_type);
	local data = keep_whole and require (module);								-- require() avoids metatable trap for variant data
	if not data then															-- not 'var_sup' (or require() gave nothing usable)
		data = mw.loadData (module);
	end

	local entries = data;														-- default: read the module's top-level table
	if 'lang' == src_type and data.active then									-- for ~/iana_languages (active)
		entries = data.active;
	elseif 'lang_dep' == src_type and data.deprecated then						-- for ~/iana_languages (deprecated)
		entries = data.deprecated;
	end

	local lowered = {};
	for tag, value in pairs (entries) do
		if keep_whole then
			lowered[tag:lower()] = value;										-- for variant and suppressed everything is needed
		else
			lowered[tag:lower()] = value[1];									-- ignore multiple names; take first name only
		end
	end
	return lowered;
end

-- Assemble the decode tables one field at a time. Each field is the lowercase-keyed copy that key_to_lower()
-- makes of one IANA-derived data module; the second argument tells key_to_lower() which part of the module
-- to read and whether to keep whole values ('var_sup') or first names only.
local lang_name_table = {};

lang_name_table.lang = key_to_lower ('Module:Language/data/iana languages', 'lang');			-- active language tags
lang_name_table.lang_dep = key_to_lower ('Module:Language/data/iana languages', 'lang_dep');	-- deprecated language tags
lang_name_table.script = key_to_lower ('Module:Language/data/iana scripts');					-- script keys are capitalized; set to lower
lang_name_table.region = key_to_lower ('Module:Language/data/iana regions');					-- region keys are uppercase; set to lower
lang_name_table.variant = key_to_lower ('Module:Language/data/iana variants', 'var_sup');		-- whole variant records are kept
lang_name_table.suppressed = key_to_lower ('Module:Language/data/iana suppressed scripts', 'var_sup');	-- script keys are capitalized; set to lower


--[[--------------------------< O V E R R I D E >--------------------------------------------------------------

Language codes and names in this table override the BCP47 names in lang_name_table.

indexes in this table shall always be lower case

]]

-- Maps a lowercase language code to the name that is to be used in place of the name held in
-- lang_name_table. One entry per line, in alphabetical order of code.
local override = {
	['aa'] = 'Afarice',
	['ab'] = 'Abasgice',
	['ady'] = 'Adygeice',
	['ae'] = 'Avestane',
	['af'] = 'lingua Batava Capitensi',
	['ak'] = 'linguis Akan',
	['akk'] = 'Accadice',
	['am'] = 'Amharice',
	['an'] = 'lingua Aragonensi',
	['ang'] = 'Anglosaxonice',
	['ar'] = 'Arabice',
	['arc'] = 'Aramaice',
	['as'] = 'Assamice',
	['av'] = 'Avarice',
	['ay'] = 'Aymarice',
	['az'] = 'Atropatenice',
	['ba'] = 'Baschkirice',
	['be'] = 'Albaruthenice',
	['bg'] = 'Bulgarice',
	['bh'] = 'lingua Bihari',
	['bi'] = 'Bislama',
	['bm'] = 'Bambara',
	['bn'] = 'Bengalice',
	['bo'] = 'Tibetane',
	['br'] = 'Britonice',
	['bs'] = 'Bosnice',
	['ca'] = 'Catalane',
	['ce'] = 'Tsetsenice',
	['ch'] = 'Chamoruane',
	['co'] = 'Corsice',
	['cop'] = 'Coptice',
	['cr'] = 'lingua Cree',
	['crh'] = 'Tatarice Taurice',
	['cs'] = 'Cechice',
	['cu'] = 'lingua Slavica ecclesiastica',
	['cv'] = 'Tschuwaschice',
	['cy'] = 'Cambrice',
	['da'] = 'Danice',
	['de'] = 'Theodisce',
	['dv'] = 'lingua Dhivehi',
	['dz'] = 'lingua Dzongkha',
	['ee'] = 'lingua Ewe',
	['el'] = 'Neograece',
	['en'] = 'Anglice',
	['eo'] = 'Esperantice',
	['es'] = 'Hispanice',
	['et'] = 'Estonice',
	['eu'] = 'Vasconice',
	['fa'] = 'Persice',
	['ff'] = 'lingua Fula',
	['fi'] = 'Finnice',
	['fj'] = 'lingua Vitiensi',
	['fo'] = 'lingua Faeroensi',
	['fr'] = 'Francogallice',
	['fy'] = 'lingua Frisica occidentali',
	['ga'] = 'Hibernice',
	['gd'] = 'Gadelice',
	['gez'] = 'Aethiopice',
	['gl'] = 'Gallaice',
	['gn'] = 'Guaranice',
	['got'] = 'Gothice',
	['grc'] = 'Graece',
	['gu'] = 'lingua Gujaratensi',
	['gv'] = 'lingua Monensi',
	['ha'] = 'Haussane',
	['haw'] = 'Havaiane',
	['he'] = 'Hebraice',
	['hi'] = 'Hindice',
	['hno'] = 'lingua Hindko',
	['ho'] = 'lingua Hiri Motu',
	['hr'] = 'Croatice',
	['ht'] = 'Haitiane',
	['hu'] = 'Hungarice',
	['hy'] = 'Armenice',
	['hz'] = 'lingua Herero',
	['ia'] = 'lingua Interlingua',
	['id'] = 'Indonesice',
	['ie'] = 'Interlingue',
	['ig'] = 'lingua Igbo',
	['ii'] = 'lingua Yi Sichuanensi',
	['ik'] = 'lingua Inupiaq',
	['io'] = 'lingua Ido',
	['is'] = 'Islandice',
	['it'] = 'Italiane',
	['iu'] = 'lingua Inuktitut',
	['ja'] = 'Iaponice',
	['jv'] = 'Iavanice',
	['ka'] = 'Georgiane',
	['kg'] = 'lingua Kongo',
	['ki'] = 'lingua Kikuyu',
	['kj'] = 'lingua Kuanyama',
	['kk'] = 'Casachice',
	['kl'] = 'Groenlandice',
	['km'] = 'Chmerice',
	['kn'] = 'Cannadice',
	['ko'] = 'Coreane',
	['kr'] = 'lingua Kanuri',
	['ks'] = 'Casmirice',
	['ku'] = 'Curdice',
	['kv'] = 'lingua Komiensi',
	['kw'] = 'Cornubice',
	['ky'] = 'Kyrgyzice',
	['la'] = 'Latine',
	['lb'] = 'Luxemburgice',
	['lg'] = 'lingua Luganda',
	['li'] = 'Limburgice',
	['ln'] = 'lingua Lingala',
	['lo'] = 'lingua Lao',
	['lt'] = 'Lituane',
	['lu'] = 'lingua Luba-Katanga',
	['lv'] = 'Lettonice',
	['mg'] = 'lingua Malagasiensi',
	['mh'] = 'lingua Marsaliensi',
	['mi'] = 'Maoriane',
	['mk'] = 'Macedonice',
	['ml'] = 'Malabarice',
	['mn'] = 'Mongolice',
	['mnc'] = 'Mandshurice',
	['mr'] = 'Marathice',
	['ms'] = 'Malaice',
	['mt'] = 'lingua Melitensi',
	['my'] = 'Birmanice',
	['na'] = 'Nauruanice',
	['nb'] = 'lingua Norvegica libraria',
	['nd'] = 'lingua Ndebele boreali',
	['ne'] = 'macrolingua Nepalensi',
	['ng'] = 'lingua Ndonga',
	['nl'] = 'Batavice',
	['nn'] = 'lingua Norvegica novella',
	['no'] = 'Norvegice',
	['nr'] = 'lingua Ndebele australi',
	['nv'] = 'lingua Navajo',
	['ny'] = 'lingua Nyanja',
	['oc'] = 'Occitane',
	['oj'] = 'lingua Ojibwayensi',
	['om'] = 'lingua Oromo',
	['or'] = 'lingua Orissensi',
	['orv'] = 'lingua Russica antiqua',
	['os'] = 'Ossetice',
	['pa'] = 'Pengabice',
	['peo'] = 'lingua Persica antiqua',
	['phn'] = 'Phoenicice',
	['pi'] = 'Palice',
	['pl'] = 'Polonice',
	['pmt'] = 'Tuamotuane',
	['pnb'] = 'lingua Pengabica occidentali',
	['ps'] = 'Afganice',
	['pt'] = 'Lusitane',
	['qu'] = 'lingua Quechua',
	['rm'] = 'Rhaetice',
	['rn'] = 'lingua Rundi',
	['ro'] = 'Dacoromanice',
	['ru'] = 'Russice',
	['rw'] = 'lingua Rwanda',
	['sa'] = 'Sanscritice',
	['sc'] = 'Sarde',
	['scn'] = 'Sicule',
	['sd'] = 'Sindhuice',
	['se'] = 'lingua Lapponica septentrionali',
	['sg'] = 'lingua Sango',
	['sh'] = 'Serbocroatice',
	['si'] = 'lingua Singhalensi',
	['sk'] = 'Moravice',
	['sl'] = 'Slovene',
	['sm'] = 'Samoane',
	['sn'] = 'lingua Shona',
	['so'] = 'Somalice',
	['sq'] = 'Albanice',
	['sr'] = 'Serbice',
	['ss'] = 'lingua Swati',
	['st'] = 'lingua Sotho australi',
	['su'] = 'lingua Sundanensi',
	['sux'] = 'Sumerice',
	['sv'] = 'Suedice',
	['sw'] = 'Suahelice',
	['syr'] = 'Syriace',
	['ta'] = 'Tamulice',
	['te'] = 'Telingane',
	['tg'] = 'Tadiciane',
	['th'] = 'lingua Thai',
	['ti'] = 'lingua Tigrinya',
	['tk'] = 'Turcomannice',
	['tl'] = 'lingua Tagalog',
	['tli'] = 'Tlingitice',
	['tn'] = 'lingua Tswana',
	['to'] = 'Tongane',
	['tr'] = 'Turcice',
	['ts'] = 'lingua Tsonga',
	['tt'] = 'Tatarice',
	['tw'] = 'lingua Twi',
	['ty'] = 'Tahitiane',
	['ug'] = 'Uigurice',
	['uk'] = 'Ucrainice',
	['ur'] = 'lingua Urdu',
	['uz'] = 'Uzbecice',
	['ve'] = 'lingua Venda',
	['vi'] = 'Vietnamice',
	['vo'] = 'lingua Volapük',
	['wa'] = 'Vallonice',
	['wo'] = 'lingua Wolof',
	['xh'] = 'Xosane',
	['xmf'] = 'Mingrelice',
	['xpu'] = 'Punice',
	['yi'] = 'Iudaeogermanice',
	['yo'] = 'lingua Yoruba',
	['za'] = 'lingua Zhuang',
	['zh'] = 'Sinice',
	['zu'] = 'Zuluane',
	}

--[[--------------------------< A R T I C L E _ N A M E >------------------------------------------------------

for those rare occasions when article titles don't fit with the normal '<language name>-language', this table
maps language code to article title. Use of this table should be avoided and the use of redirects preferred as
that is the long-standing method of handling article names that don't fit with the normal pattern

]]

-- Maps a language code to the article title that is to be used for that language.
local article_name = {
	['lij'] = 'Ligurian (Romance language)',									-- Ligurian; see Template_talk:Lang#Ligurian_dab
	['mnh'] = 'Mono language (Congo)',											-- Mono (Democratic Republic of Congo); see Template_talk:Lang#Mono_languages
	['mnr'] = 'Mono language (California)',										-- Mono (USA)
	['mru'] = 'Mono language (Cameroon)',										-- Mono (Cameroon)
	['xlg'] = 'Ligurian (ancient language)',									-- see Template_talk:Lang#Ligurian_dab
	}


--[=[-------------------------< R T L _ S C R I P T S >--------------------------------------------------------

ISO 15924 scripts that are written right-to-left. Data in this table taken from [[ISO 15924#List of codes]]

last update to this list: 2017-12-24

]=]

-- Sequence of lowercase script codes, in alphabetical order, five to a row.
local rtl_scripts = {
	'adlm', 'arab', 'aran', 'armi', 'avst',
	'cprt', 'egyd', 'egyh', 'hatr', 'hebr',
	'hung', 'inds', 'khar', 'lydi', 'mand',
	'mani', 'mend', 'merc', 'mero', 'narb',
	'nbat', 'nkoo', 'orkh', 'palm', 'phli',
	'phlp', 'phlv', 'phnx', 'prti', 'rohg',
	'samr', 'sarb', 'sogd', 'sogo', 'syrc',
	'syre', 'syrj', 'syrn', 'thaa', 'wole',
	};


--[[--------------------------< T R A N S L I T _ T I T L E S >------------------------------------------------

This is a table of tables of transliteration standards and the language codes or language scripts that apply to
those standards. This table is used to create the tool-tip text associated with the transliterated text displayed
by some of the {{lang-??}} templates.

These tables are more-or-less copied directly from {{transl}}. The standard 'no_std' is a construct to allow for
the cases when no |std= parameter value is provided.

]]

-- Outer key: lowercase name of a transliteration standard. Inner key: a language code, a script code, or
-- 'default'. Value: the title text for that combination. Standards that have only a 'default' title are
-- written on a single line; the others are grouped by the title that their codes share.
local translit_title_table = {
	['ahl'] = {['default'] = 'Academy of the Hebrew Language transliteration'},
	['ala'] = {['default'] = 'American Library Association – Library of Congress transliteration'},
	['ala-lc'] = {['default'] = 'American Library Association – Library of Congress transliteration'},
	['batr'] = {['default'] = 'Bikdash Arabic Transliteration Rules'},
	['bgn/pcgn'] = {['default'] = 'Board on Geographic Names / Permanent Committee on Geographical Names transliteration'},

	['din'] = {
		-- DIN 31635 Arabic
		['ar'] = 'DIN 31635 Arabic',
		['fa'] = 'DIN 31635 Arabic',
		['ku'] = 'DIN 31635 Arabic',
		['ps'] = 'DIN 31635 Arabic',
		['tg'] = 'DIN 31635 Arabic',
		['ug'] = 'DIN 31635 Arabic',
		['ur'] = 'DIN 31635 Arabic',
		['arab'] = 'DIN 31635 Arabic',

		['default'] = 'DIN transliteration',
		},

	['eae'] = {['default'] = 'Encyclopaedia Aethiopica transliteration'},
	['hepburn'] = {['default'] = 'Hepburn transliteration'},
	['hunterian'] = {['default'] = 'Hunterian transliteration'},
	['iast'] = {['default'] = 'International Alphabet of Sanskrit transliteration'},

	['iso'] = {																	-- when a transliteration standard is supplied
		-- ISO 9 Cyrillic
		['ab'] = 'ISO 9 Cyrillic',
		['ba'] = 'ISO 9 Cyrillic',
		['be'] = 'ISO 9 Cyrillic',
		['bg'] = 'ISO 9 Cyrillic',
		['kk'] = 'ISO 9 Cyrillic',
		['ky'] = 'ISO 9 Cyrillic',
		['mn'] = 'ISO 9 Cyrillic',
		['ru'] = 'ISO 9 Cyrillic',
		['tg'] = 'ISO 9 Cyrillic',
		['uk'] = 'ISO 9 Cyrillic',
		['bua'] = 'ISO 9 Cyrillic',
		['sah'] = 'ISO 9 Cyrillic',
		['tut'] = 'ISO 9 Cyrillic',
		['xal'] = 'ISO 9 Cyrillic',
		['cyrl'] = 'ISO 9 Cyrillic',

		-- ISO 233 Arabic
		['ar'] = 'ISO 233 Arabic',
		['ku'] = 'ISO 233 Arabic',
		['ps'] = 'ISO 233 Arabic',
		['ug'] = 'ISO 233 Arabic',
		['ur'] = 'ISO 233 Arabic',
		['arab'] = 'ISO 233 Arabic',

		-- ISO 259 Hebrew
		['he'] = 'ISO 259 Hebrew',
		['yi'] = 'ISO 259 Hebrew',
		['hebr'] = 'ISO 259 Hebrew',

		-- ISO 843 Greek
		['el'] = 'ISO 843 Greek',
		['grc'] = 'ISO 843 Greek',

		-- ISO 3602 Japanese
		['ja'] = 'ISO 3602 Japanese',
		['hira'] = 'ISO 3602 Japanese',
		['hrkt'] = 'ISO 3602 Japanese',
		['jpan'] = 'ISO 3602 Japanese',
		['kana'] = 'ISO 3602 Japanese',

		-- ISO 7098 Chinese
		['zh'] = 'ISO 7098 Chinese',
		['chi'] = 'ISO 7098 Chinese',
		['pny'] = 'ISO 7098 Chinese',
		['zho'] = 'ISO 7098 Chinese',
--		['han'] = 'ISO 7098 Chinese',											-- unicode alias of Hani? doesn't belong here? should be Hani?
		['hans'] = 'ISO 7098 Chinese',
		['hant'] = 'ISO 7098 Chinese',

		-- ISO 9984 Georgian
		['ka'] = 'ISO 9984 Georgian',
		['kat'] = 'ISO 9984 Georgian',

		-- ISO 9985 Armenian
		['arm'] = 'ISO 9985 Armenian',
		['hy'] = 'ISO 9985 Armenian',

		-- ISO 11940 Thai
		['th'] = 'ISO 11940 Thai',
		['tha'] = 'ISO 11940 Thai',

		-- ISO 11941 Korean
		['ko'] = 'ISO 11941 Korean',
		['kor'] = 'ISO 11941 Korean',

		-- ISO 15919 Indic
		['awa'] = 'ISO 15919 Indic',
		['bho'] = 'ISO 15919 Indic',
		['bn'] = 'ISO 15919 Indic',
		['bra'] = 'ISO 15919 Indic',
		['doi'] = 'ISO 15919 Indic',
		['dra'] = 'ISO 15919 Indic',
		['gon'] = 'ISO 15919 Indic',
		['gu'] = 'ISO 15919 Indic',
		['hi'] = 'ISO 15919 Indic',
		['inc'] = 'ISO 15919 Indic',
		['kn'] = 'ISO 15919 Indic',
		['kok'] = 'ISO 15919 Indic',
		['ks'] = 'ISO 15919 Indic',
		['mag'] = 'ISO 15919 Indic',
		['mai'] = 'ISO 15919 Indic',
		['ml'] = 'ISO 15919 Indic',
		['mr'] = 'ISO 15919 Indic',
		['ne'] = 'ISO 15919 Indic',
		['new'] = 'ISO 15919 Indic',
		['or'] = 'ISO 15919 Indic',
		['pa'] = 'ISO 15919 Indic',
		['raj'] = 'ISO 15919 Indic',
		['sa'] = 'ISO 15919 Indic',
		['sat'] = 'ISO 15919 Indic',
		['sd'] = 'ISO 15919 Indic',
		['si'] = 'ISO 15919 Indic',
		['ta'] = 'ISO 15919 Indic',
		['tcy'] = 'ISO 15919 Indic',
		['te'] = 'ISO 15919 Indic',
		['beng'] = 'ISO 15919 Indic',
		['brah'] = 'ISO 15919 Indic',
		['deva'] = 'ISO 15919 Indic',
		['gujr'] = 'ISO 15919 Indic',
		['guru'] = 'ISO 15919 Indic',
		['knda'] = 'ISO 15919 Indic',
		['mlym'] = 'ISO 15919 Indic',
		['orya'] = 'ISO 15919 Indic',
		['sinh'] = 'ISO 15919 Indic',
		['taml'] = 'ISO 15919 Indic',
		['telu'] = 'ISO 15919 Indic',

		['default'] = 'ISO transliteration',
		},

	['jyutping'] = {['default'] = 'Jyutping transliteration'},
	['mlcts'] = {['default'] = 'Myanmar Language Commission Transcription System'},
	['mr'] = {['default'] = 'McCune–Reischauer transliteration'},
	['nihon-shiki'] = {['default'] = 'Nihon-shiki transliteration'},

	['no_std'] = {																-- when no transliteration standard is supplied
		['akk'] = 'Semitic transliteration',
		['sem'] = 'Semitic transliteration',
		['phnx'] = 'Semitic transliteration',
		['xsux'] = 'Cuneiform transliteration',
		},

	['pinyin'] = {['default'] = 'Pinyin transliteration'},
	['rr'] = {['default'] = 'Revised Romanization of Korean transliteration'},
	['rtgs'] = {['default'] = 'Royal Thai General System of Transcription'},
	['satts'] = {['default'] = 'Standard Arabic Technical Transliteration System transliteration'},
	['scientific'] = {['default'] = 'scientific transliteration'},
	['ukrainian'] = {['default'] = 'Ukrainian National system of romanization'},
	['ungegn'] = {['default'] = 'United Nations Group of Experts on Geographical Names transliteration'},
	['wadegile'] = {['default'] = 'Wade–Giles transliteration'},
	['wehr'] = {['default'] = 'Hans Wehr transliteration'},
	};


-- Module exports: the tables defined in this module, keyed by their own names.
--
-- special_tags_table is not exported here: no table of that name is defined anywhere in this module, so the
-- former `special_tags_table = special_tags_table` entry read an undefined global. That always evaluated to nil
-- (which creates no key in the returned table) and would raise an error if strict global checking were
-- enabled. Restore the entry only together with a `local special_tags_table = {...}` definition.
return
	{
	article_name = article_name,
	lang_name_table = lang_name_table,
	override = override,
	rtl_scripts = rtl_scripts,
	translit_title_table = translit_title_table,
	};