-- Module:Multilingual description

local p = {}

--[==[
  Map from language codes that are accepted as input but are legacy, deprecated or
  otherwise not directly usable, to the replacement code that should be used instead
  in order to get a visible language name (and, where possible, BCP 47 conformance).
  Note that several keys may share the same replacement (e.g. 'fa-af' and 'prd' both
  map to 'fa').
--]==]
local remappedLanguages = {
    ['als'] = 'gsw', -- known code (unfortunately broken in Wikimedia) to changed new code (also known, but conforming to BCP 47); 'als' on Wikimedia for Alemannic conflicts with the standard 'als' which actually refers to the unrelated standard variant of the Albanian language.
    ['nrm'] = 'nrf', -- known code (unfortunately broken in Wikimedia) to changed new code (also known, but conforming to BCP 47); 'nrm' on Wikimedia for Norman conflicts with the standard 'nrm' which actually refers to the unrelated Narom language

    ['bat-smg'] = 'sgs', -- legacy interwiki still supported, newer conforming code now supported as well in interwiki
    ['be-x-old'] = 'be-tarask', -- legacy interwiki still supported, newer conforming code now supported as well in interwiki
    ['fiu-vro'] = 'vro', -- legacy interwiki still supported, newer conforming code now supported as well in interwiki
    ['roa-rup'] = 'rup', -- code not conforming to BCP 47 (legacy interwiki still supported, newer code supported as well), the new standard code should be used (and is now recognized as interwiki)
    ['roa-tara'] = 'nap-taran', -- code not conforming to BCP 47 (continental variant of Neapolitan), should be replaced by a conforming variant code
    ['zh-classical'] = 'lzh', -- code not conforming to BCP 47, replaced by standard code also supported in interwiki

    ['bh'] = 'bho', -- legacy interwiki still supported but ambiguous as a family, newer code now supported as well in interwiki; 'bh' was used in Wikimedia to refer to Bhojpuri only and not the whole Bihari family
    ['bu'] = 'my', -- legacy code from ISO 639 deprecated, newer code is preferred and used as interwiki
    ['iw'] = 'he', -- legacy code from ISO 639 deprecated, newer code is preferred and used as interwiki
    ['jw'] = 'jv', -- legacy code from ISO 639 deprecated, newer code is preferred and used as interwiki
    ['zh-min-nan'] = 'nan', -- legacy BCP 47 code, valid but deprecated in favor of new code also supported in interwiki
    ['zh-wuu'] = 'wuu', -- legacy BCP 47 code, valid but deprecated in favor of new code also supported in interwiki
    ['zh-yue'] = 'yue', -- legacy BCP 47 code, valid but deprecated in favor of new code also supported in interwiki

    -- Standard BCP 47 codes that still cannot be used in MediaWiki, and must be replaced for now by valid BCP 47 codes (ignoring the specific variant).

    ['en-us'] = 'en', -- both codes are conforming and supported, only the second one is known (there's no support for the US variant which is implicit, unlike variants in CA, GB, IN, ZA)
    ['fa-af'] = 'fa', -- both codes are conforming and supported, only the second one is known, actually means "Eastern Dari"
    ['fr-x-galo'] = 'fr', -- both codes are conforming and supported, only the second one is known
    ['ha-latn'] = 'ha', -- both codes are conforming and supported, only the second one is known, the Latin script is the default since the 1950s
    ['ha-arab'] = 'ha', -- both codes are conforming and supported, only the second one is known, the Arabic script is historic, without clear orthography
    ['ko-kr'] = 'ko', -- both codes are conforming and supported, only the second one is known, regional variant used in South Korea (ko-kp for the variant in North Korea is supported)
    ['ku-cyrl'] = 'ku', -- both codes are conforming and supported, only the second one is known, the Cyrillic script is still used
    ['ku-latn'] = 'ku', -- both codes are conforming and supported, only the second one is known, the Latin script is the default since the 1950s
    ['no'] = 'nb', -- both codes are conforming and supported, but the first one is now used only for meaning the second one in MediaWiki
    ['prd'] = 'fa', -- both codes are conforming and supported, only the second one is known; 'prd' is "Parsi-Dari", and means the same as 'fa-af'
    ['tgl'] = 'tl', -- both codes are conforming and supported, only the second one is known (but may have alternate forms written in the Tagal script and not Latin for modern Filipino)

    -- Standard BCP 47 codes for variants that still cannot be used in MediaWiki, and must be replaced for now by legacy codes, valid only in Wikimedia wikis but not conforming to BCP 47.

    ['sr-cyrl'] = 'sr-ec', -- this alternate known code is non-standard and in fact not supported, but has a correct native name
    ['sr-latn'] = 'sr-el', -- same remark as for 'sr-cyrl' above
}

local sortedKnownLanguageTags = require('Module:Multilingual description/sort')
local dir = require('Module:Dir').select

--[==[
  Appends one rendered description to the `descriptions` list.
  Nothing is appended when `description` is not a string, or when it is
  empty once trimmed. Otherwise the 'Ls' template is expanded with the
  language code, the description text, the text direction selected by
  `dir` for that language, the 'description' class and the optional
  `update` value, and the expansion result is appended.
--]==]
local function addDescription(descriptions, lang, description, update)
    if type(description) ~= 'string' then
        return
    end
    if mw.text.trim(description) == '' then
        return
    end
    local frame = mw.getCurrentFrame()
    local rendered = frame:expandTemplate{
        title = 'Ls',
        args = {
            lang,
            description,
            dir = dir(lang, 'rtl', 'ltr'),
            classes = 'description',
            update = update
        }
    }
    table.insert(descriptions, rendered)
end

-- Appends a tracking category link to the `descriptions` list.
-- `kind` names the problem being tracked; callers in this module pass
-- 'deprecated', 'conflicting' or 'unsupported'.
local function addTracking(descriptions, kind)
    local prefix = '[[Category:Multilingual descriptions using '
    local suffix = ' language codes]]'
    local category = prefix .. kind .. suffix
    table.insert(descriptions, category)
end

--[==[
  Builds the wikitext of a multilingual description.

  Parameters:
    args  table whose string keys are language codes (possibly several codes in
          one key, split on any character that is not an ASCII letter, digit,
          hyphen or underscore) and whose string values are the descriptions.
          Entries with a non-string key or value are ignored.

  Returns:
    a single string: the rendered descriptions (languages listed in
    `sortedKnownLanguageTags` first, in that order, then the remaining codes
    sorted by code), followed by any tracking categories.
--]==]
local function _mld(args)
    --[==[
    The loop below fills a fresh plain table instead of editing `args` in place.
    Per the original author's note, keys of an argument table coming from a parent
    frame cannot be unset, its metatable only exposes a *read-only* interface with
    accessors to PHP arrays, and mw.clone does NOT work on it.
    --]==]
    local descriptions, conflicting = {}, false
    for lang, description in pairs(args) do
        if type(lang) == 'string' and type(description) == 'string' then
            --[==[
            MediaWiki trims the names of named arguments and their values, but does not remove HTML comments
            in these names (some Mld contain parameters like "| sk <!--Slovensko--> = ...") or "nowiki" tags.
            After removing them, we still need to trim the rest in language codes and in descriptions, to
            detect conflicting descriptions for the same language code.
            --]==]
            description = description
                :gsub('<!%-%-.-%-%->', '')
                :gsub('</?nowiki%s*/?>', '')
                :gsub('^%s*(.-)%s*$','%1')
            --[==[
            Split multiple language codes (or default) assigned with the same description.
            Valid BCP 47 language codes contain only ASCII letters, digits, hyphens or
            underscores: canonicalize them to lowercase with hyphens replacing underscores
            (other characters are considered separators between language codes).
            --]==]
            for code in lang
                    :gsub('<!%-%-.-%-%->', '')
                    :gsub('</?nowiki%s*/?>', '')
                    :gsub('_', '-'):lower():gmatch("([%-0-9a-z]+)") do
                --[==[
                Detect conflicting descriptions, like "|en,default=OK|en=Bad"
                (whose result is unpredictable, as all keys are in random order).
                --]==]
                if descriptions[code] ~= nil and descriptions[code] ~= description then
                    conflicting = true
                end
                descriptions[code] = description
            end
        end
    end
    -- From here on, `args` is the cleaned code -> description map and
    -- `descriptions` is the output list being built.
    args, descriptions = descriptions, {}
    --[==[
    Remap legacy language codes, but only when the legacy code is not both supported
    and known while its replacement is.
    --]==]
    local remapped = false
    for cur, alt in pairs(remappedLanguages) do
        if args[cur] and not(mw.language.isSupportedLanguage(cur) and mw.language.isKnownLanguageTag(cur))
                and (mw.language.isSupportedLanguage(alt) and mw.language.isKnownLanguageTag(alt)) then
            if args[alt] == nil then
                args[alt] = args[cur] -- set description for the alternate known language
            elseif args[alt] ~= args[cur] then
                --[==[
                Only a *different* description is a conflict (same rule as in the
                splitting loop above): "|he,iw=HE" must not be reported.
                --]==]
                remapped = true -- signal only in case of conflict
                conflicting = true
            end
            args[cur] = nil -- unset the description for the initial language code
        end
    end
    --[==[ First all known languages in order if they have description. ]==]
    for _, lang in ipairs(sortedKnownLanguageTags) do
        if args[lang] ~= nil then
            addDescription(descriptions, lang, args[lang], nil)
            args[lang] = nil
        end
    end
    --[==[
    Append the remaining codes. They are sorted first, because pairs() gives no
    ordering guarantee and the generated wikitext should not depend on it.
    Codes that are not supported are still emitted, with their code passed as the
    `update` argument, and trigger the 'unsupported' tracking category.
    --]==]
    local leftovers = {}
    for lang in pairs(args) do
        leftovers[#leftovers + 1] = lang
    end
    table.sort(leftovers)
    local unsupported = false
    for _, lang in ipairs(leftovers) do
        if mw.language.isSupportedLanguage(lang) then
            addDescription(descriptions, lang, args[lang], nil)
        else
            addDescription(descriptions, lang, args[lang], lang)
            unsupported = true
        end
    end
    if conflicting then
        addTracking(descriptions, 'conflicting')
    end
    if remapped then
        addTracking(descriptions, 'deprecated')
    end
    if unsupported then
        addTracking(descriptions, 'unsupported')
    end
    return table.concat(descriptions)
end

--[==[
  Module entry point: renders the multilingual description from the arguments
  of the parent frame of `frame`. An empty argument table is used when there is
  no parent frame or when it carries no `args`.
--]==]
function p.mld(frame)
    local parent = frame:getParent()
    local args = parent and parent.args or {}
    return _mld(args)
end

--[==[
  Attaches a self-test to the metatable of `p`. The test feeds a fixed set of
  arguments to `_mld` and compares the result with an expected string built
  with the same helpers; it returns true on a match, and otherwise logs both
  strings and returns false.
--]==]
setmetatable(p, {quickTests = function()
    local input = {
        [1] = 'One?', -- numeric key: ignored (only string keys are language codes)
        unsupported = 'What?', -- not a supported language code
        en = ' ', -- empty description after trimming (nothing rendered)
        als = 'GSW', -- expected to be remapped to 'gsw'
        ['en-gb '] = 'EN-GB', -- trailing space in the key
        ['en-ca <nowiki/>'] = 'EN-CA', -- "nowiki" tag in the key
        [' de'] = 'DE', -- leading space in the key
        fr = 'FR',
        [' fr '] = 'FR', -- same code and same description as 'fr'
        rue = 'RUE',
        ru = 'RU',
        ko = 'KO',
        ja = 'JA',
        zh = 'ZH',
        ['he,iw'] = 'HE', -- two codes sharing one description
        ur = 'UR',
        ar = 'AR',
        ro = 'RO',
        ['be-tarask'] = 'BE-TARASK',
        ['be-x-old'] = 'BE-X-OLD (deprecated)', -- differs from 'be-tarask'
        dv = 'DV',
    }
    --[==[
    Codes and descriptions in the exact order expected in the output (ordered
    by native language name). 'en' is absent because its description is empty,
    and 'be-x-old' is absent because it is expected to collide with the
    different description given for 'be-tarask'.
    --]==]
    local ordered = {
        {'gsw', 'GSW'}, -- Alemannisch (remapped)
        {'en-gb', 'EN-GB'}, -- British English
        {'en-ca', 'EN-CA'}, -- Canadian English
        {'de', 'DE'}, -- Deutsch
        {'fr', 'FR'}, -- français
        {'ro', 'RO'}, -- română
        {'be-tarask', 'BE-TARASK'}, -- беларуская (тарашкевіца)
        {'rue', 'RUE'}, -- русиньскый
        {'ru', 'RU'}, -- русский
        {'ko', 'KO'}, -- 한국어
        {'ja', 'JA'}, -- 日本語
        {'zh', 'ZH'}, -- 中文
        {'he', 'HE'}, -- עברית
        {'ur', 'UR'}, -- اردو
        {'ar', 'AR'}, -- العربية
        {'dv', 'DV'}, -- ދިވެހިބަސް
    }
    local expect = {}
    for _, entry in ipairs(ordered) do
        addDescription(expect, entry[1], entry[2])
    end
    --[==[
    Codes outside the sorted list come last. Only one such code is used here,
    so the comparison does not depend on the relative order of several of them.
    An unsupported code is rendered with its code as the `update` argument.
    --]==]
    addDescription(expect, 'unsupported', 'What?', 'unsupported')
    for _, kind in ipairs{'conflicting', 'deprecated', 'unsupported'} do
        addTracking(expect, kind)
    end
    expect = table.concat(expect)
    local actual = _mld(input)
    if actual == expect then
        return true
    end
    mw.log('expect:\n' .. expect)
    mw.log('actual:\n' .. actual)
    return false
end})
--[==[ Type this to run tests in the Lua console:
=getmetatable(p).quickTests() -- should return true
--]==]
return p