Page MenuHomePhabricator

Identify language communities without a Wikipedia that may or may not be in the Incubator
Open, MediumPublic

Description

This task involves identifying language communities that:

  • Do not yet have a Wikipedia.
  • May or may not be in the Incubator.
  • Lack quality machine translation services.

One approach to accomplish this is to examine all the languages listed on https://translate.wmcloud.org/ and develop an automated solution (a Python or JavaScript script) that identifies the languages from this list that do not have a Wikipedia, and then checks whether each of those languages is present in the Incubator.

This approach aims to facilitate starting with languages already in the Incubator, making it easier to contact administrators for the outreach plan to gather information about languages for Opus MT machine translation models.

Event Timeline

srishakatux created this task.

The Python script below checks whether a Wikipedia exists for each language listed in the dropdown on the right-hand side of https://translate.wmcloud.org/ and prints the languages for which no Wikipedia exists:

import requests

def check_wikipedia_language(language, language_name):
    """Report whether ``language`` appears to have no Wikipedia.

    Sends a GET request to ``https://{language}.wikipedia.org`` with a
    5-second timeout and inspects the final HTTP status code.

    Args:
        language: Code used as the ``wikipedia.org`` subdomain.
        language_name: Human-readable name; used only in the warning that is
            emitted when the request itself fails.

    Returns:
        ``False`` when the request completes with HTTP 200, ``True`` for any
        other status code or when the request raises
        ``requests.RequestException``.
    """
    import sys  # local import so the block does not rely on a file-level one

    try:
        response = requests.get(f"https://{language}.wikipedia.org", timeout=5)
    except requests.RequestException as error:
        # A timeout or connection failure says nothing about whether the wiki
        # exists. The original best-effort answer (True) is kept, but the
        # failure is surfaced on stderr so the row can be re-checked instead
        # of being silently reported as "no Wikipedia".
        print(
            f"warning: request for {language_name} ({language}) failed: {error}",
            file=sys.stderr,
        )
        return True
    return response.status_code != 200

def main():
    """Print a table of the listed languages that have no Wikipedia.

    Walks a fixed code -> name table in insertion order, asks
    ``check_wikipedia_language`` about each entry, and prints one
    ``| name | code |`` row for every entry it reports as missing. A header
    row is printed first.
    """
    # Keys are the codes substituted into the wikipedia.org host name by
    # check_wikipedia_language; values are the display names printed in the
    # first column. Entries whose name equals the code have no display name
    # in this table.
    language_names = {
        "ace": "Achinese",
        "acq": "acq",
        "ady": "Adyghe",
        "af": "Afrikaans",
        "ajp": "ajp",
        "ak": "Akan",
        "sq": "Albanian",
        "am": "Amharic",
        "ar": "Arabic",
        "an": "Aragonese",
        "hy": "Armenian",
        "frp": "Arpitan",
        "as": "Assamese",
        "ast": "Asturian",
        "av": "Avaric",
        "awa": "Awadhi",
        "ay": "Aymara",
        "az": "Azerbaijani",
        "ban": "Balinese",
        "bm": "Bambara",
        "bn": "Bangla",
        "bjn": "Banjar",
        "ba": "Bashkir",
        "eu": "Basque",
        "bar": "Bavarian",
        "be": "Belarusian",
        "bem": "Bemba",
        "bh": "Bhojpuri",
        "bi": "Bislama",
        "brx": "Bodo",
        "bs": "Bosnian",
        "br": "Breton",
        "bug": "Buginese",
        "bg": "Bulgarian",
        "my": "Burmese",
        "zh-yue": "Cantonese",
        "ca": "Catalan",
        "ceb": "Cebuano",
        "tzm": "Central Atlas Tamazight",
        "bcl": "Central Bikol",
        "ckb": "Central Kurdish",
        "ch": "Chamorro",
        "ce": "Chechen",
        "chr": "Cherokee",
        "zh": "Chinese",
        "cjk": "cjk",
        "kw": "Cornish",
        "cr": "Cree",
        "crh": "Crimean Tatar",
        "hr": "Croatian",
        "cs": "Czech",
        "da": "Danish",
        "din": "Dinka",
        "doi": "Dogri",
        "nl": "Dutch",
        "dyu": "Dyula",
        "dz": "Dzongkha",
        "arz": "Egyptian Arabic",
        "myv": "Erzya",
        "eo": "Esperanto",
        "et": "Estonian",
        "ee": "Ewe",
        "fo": "Faroese",
        "hif": "Fiji Hindi",
        "fj": "Fijian",
        "fi": "Finnish",
        "fon": "Fon",
        "fr": "French",
        "fur": "Friulian",
        "ff": "Fula",
        "gag": "Gagauz",
        "gl": "Galician",
        "gan": "Gan",
        "lg": "Ganda",
        "ka": "Georgian",
        "de": "German",
        "gom": "Goan Konkani",
        "gor": "Gorontalo",
        "el": "Greek",
        "gn": "Guarani",
        "gu": "Gujarati",
        "guw": "Gun",
        "ht": "Haitian Creole",
        "ha": "Hausa",
        "he": "Hebrew",
        "hi": "Hindi",
        "hne": "hne",
        "hu": "Hungarian",
        "is": "Icelandic",
        "io": "Ido",
        "ig": "Igbo",
        "ilo": "Iloko",
        "id": "Indonesian",
        "iu": "Inuktitut",
        "acm": "Iraqi Arabic",
        "ga": "Irish",
        "it": "Italian",
        "jam": "Jamaican Creole English",
        "ja": "Japanese",
        "jv": "Javanese",
        "kbd": "Kabardian",
        "kbp": "Kabiye",
        "kea": "Kabuverdianu",
        "kab": "Kabyle",
        "kac": "Kachin",
        "kl": "Kalaallisut",
        "xal": "Kalmyk",
        "kam": "Kamba",
        "kn": "Kannada",
        "kr": "Kanuri",
        "kaa": "Kara-Kalpak",
        "krc": "Karachay-Balkar",
        "ks": "Kashmiri",
        "kk": "Kazakh",
        "km": "Khmer",
        "ki": "Kikuyu",
        "kmb": "Kimbundu",
        "rw": "Kinyarwanda",
        "knc": "knc",
        "kv": "Komi",
        "koi": "Komi-Permyak",
        "kg": "Kongo",
        "ko": "Korean",
        "ku": "Kurdish",
        "ky": "Kyrgyz",
        "lo": "Lao",
        "ltg": "Latgalian",
        "lv": "Latvian",
        "apc": "Levantine Arabic",
        "lij": "Ligurian",
        "li": "Limburgish",
        "ln": "Lingala",
        "lt": "Lithuanian",
        "lmo": "Lombard",
        "nds": "Low German",
        "nds-nl": "Low Saxon",
        "lua": "Luba-Lulua",
        "luo": "Luo",
        "lb": "Luxembourgish",
        "mk": "Macedonian",
        "mad": "Madurese",
        "mag": "Magahi",
        "mai": "Maithili",
        "mg": "Malagasy",
        "ms": "Malay",
        "ml": "Malayalam",
        "mt": "Maltese",
        "mni": "Manipuri",
        "gv": "Manx",
        "mi": "Māori",
        "mr": "Marathi",
        "min": "Minangkabau",
        "zh-min-nan": "Minnan",
        "mwl": "Mirandese",
        "lus": "Mizo",
        "mdf": "Moksha",
        "mnw": "Mon",
        "mn": "Mongolian",
        "ary": "Moroccan Arabic",
        "mos": "Mossi",
        "ars": "Najdi Arabic",
        "nv": "Navajo",
        "ne": "Nepali",
        "new": "Newari",
        "nia": "Nias",
        "se": "Northern Sami",
        "nso": "Northern Sotho",
        "no": "Norwegian",
        "nn": "Norwegian Nynorsk",
        "nus": "Nuer",
        "ny": "Nyanja",
        "oc": "Occitan",
        "or": "Odia",
        "ang": "Old English",
        "om": "Oromo",
        "os": "Ossetian",
        "pam": "Pampanga",
        "pag": "Pangasinan",
        "pap": "Papiamento",
        "ps": "Pashto",
        "fa": "Persian",
        "pl": "Polish",
        "pt": "Portuguese",
        "pa": "Punjabi",
        "qu": "Quechua",
        "ro": "Romanian",
        "rm": "Romansh",
        "rn": "Rundi",
        "ru": "Russian",
        "sm": "Samoan",
        "sg": "Sango",
        "sa": "Sanskrit",
        "sat": "Santali",
        "skr": "Saraiki",
        "sc": "Sardinian",
        "stq": "Saterland Frisian",
        "gd": "Scottish Gaelic",
        "sr": "Serbian",
        "sh": "Serbo-Croatian",
        "shn": "Shan",
        "sn": "Shona",
        "scn": "Sicilian",
        "szl": "Silesian",
        "simple": "Simple English",
        "sd": "Sindhi",
        "si": "Sinhala",
        "sk": "Slovak",
        "sl": "Slovenian",
        "so": "Somali",
        "azb": "Southern Azerbaijani",
        "alt": "Southern Altai",
        "st": "Southern Sotho",
        "es": "Spanish",
        "srn": "Sranan Tongo",
        "su": "Sundanese",
        "sw": "Swahili",
        "ss": "Swati",
        "sv": "Swedish",
        "tl": "Tagalog",
        "ty": "Tahitian",
        "tg": "Tajik",
        "tly": "Talysh",
        "ta": "Tamil",
        "taq": "Tamasheq",
        "tt": "Tatar",
        "te": "Telugu",
        "tet": "Tetum",
        "th": "Thai",
        "bo": "Tibetan",
        "ti": "Tigrinya",
        "tpi": "Tok Pisin",
        "to": "Tongan",
        "ts": "Tsonga",
        "tn": "Tswana",
        "tcy": "Tulu",
        "tum": "Tumbuka",
        "aeb": "Tunisian Arabic",
        "tr": "Turkish",
        "tk": "Turkmen",
        "tyv": "Tuvan",
        "tw": "Twi",
        "uk": "Ukrainian",
        "umb": "Umbundu",
        "ur": "Urdu",
        "ug": "Uyghur",
        "uz": "Uzbek",
        "ve": "Venda",
        "vec": "Venetian",
        "vi": "Vietnamese",
        "wa": "Walloon",
        "war": "Waray",
        "cy": "Welsh",
        "wo": "Wolof",
        "wuu": "Wu Chinese",
        "xh": "Xhosa",
        "yi": "Yiddish",
        "yo": "Yoruba",
        "zu": "Zulu",
    }

    # Header row of the pipe-delimited table.
    print("| Language name | ISO code |")

    # Rows are printed as each check completes, so partial results are
    # visible while the remaining requests are still running.
    for code, name in language_names.items():
        if not check_wikipedia_language(code, name):
            continue
        print(f"| {name} | {code} |")

# Run the Wikipedia check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

The output of the script in the comment above lists the languages without a Wikipedia, along with their names and ISO codes. Here is the list:

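| Language name | ISO code |
| acq | acq |
| ajp | ajp |
| Bemba | bem |
| Bodo | brx |
| Central Atlas Tamazight | tzm |
| cjk | cjk |
| Dogri | doi |
| Dyula | dyu |
| hne | hne |
| Iraqi Arabic | acm |
| Kabuverdianu | kea |
| Kachin | kac |
| Kamba | kam |
| Kanuri | kr |
| Kimbundu | kmb |
| knc | knc |
| Levantine Arabic | apc |
| Luba-Lulua | lua |
| Luo | luo |
| Magahi | mag |
| Mizo | lus |
| Mossi | mos |
| Najdi Arabic | ars |
| Nuer | nus |
| Tamasheq | taq |
| Tunisian Arabic | aeb |
| Umbundu | umb |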

The Python script below takes the list above (the languages without a Wikipedia) and checks whether each of them is present in the Incubator:

import requests

def check_language_existence(language_code):
    """Return whether ``language_code`` is in the Incubator API's language list.

    Queries the Incubator ``api.php`` endpoint for ``meta=siteinfo`` with
    ``siprop=languages`` and looks for an entry whose ``code`` equals
    ``language_code``.

    Args:
        language_code: Language code to look up.

    Returns:
        ``True`` if an entry with a matching ``code`` is present in
        ``query.languages`` of the response, ``False`` otherwise (including
        when the response has no ``languages`` key).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # NOTE(review): siprop=languages looks like the list of language codes
    # MediaWiki knows about, not the list of languages that actually have an
    # Incubator test wiki -- confirm against the siteinfo API documentation
    # that this is the intended check.
    url = "https://incubator.wikimedia.org/w/api.php?action=query&format=json&meta=siteinfo&siprop=languages&formatversion=2"

    # requests has no default timeout; without one a stalled connection would
    # hang the whole script.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    languages = response.json()['query'].get('languages', [])
    return any(lang['code'] == language_code for lang in languages)

# Languages reported as having no Wikipedia, as display name -> code.
# Insertion order is the order in which rows are checked and printed.
language_data = {
    "acq": "acq", "ajp": "ajp", "Bemba": "bem", "Bodo": "brx",
    "Central Atlas Tamazight": "tzm", "cjk": "cjk", "Dogri": "doi",
    "Dyula": "dyu", "hne": "hne", "Iraqi Arabic": "acm",
    "Kabuverdianu": "kea", "Kachin": "kac", "Kamba": "kam", "Kanuri": "kr",
    "Kimbundu": "kmb", "knc": "knc", "Levantine Arabic": "apc",
    "Luba-Lulua": "lua", "Luo": "luo", "Magahi": "mag", "Mizo": "lus",
    "Mossi": "mos", "Najdi Arabic": "ars", "Nuer": "nus", "Tamasheq": "taq",
    "Tunisian Arabic": "aeb", "Umbundu": "umb",
}

# Header row of the pipe-delimited table.
print("| Language name | ISO code |")

# Print one row per language that check_language_existence reports as present.
for language_name, language_code in language_data.items():
    if not check_language_existence(language_code):
        continue
    print(f"| {language_name} | {language_code} |")

The output of the script in the comment above lists 8 languages that have no Wikipedia but are present in the Incubator, along with their names and ISO codes. Here is the list:

| Language name | ISO code |
| Central Atlas Tamazight | tzm |
| Iraqi Arabic | acm |
| Kabuverdianu | kea |
| Kanuri | kr |
| Magahi | mag |
| Mizo | lus |
| Mossi | mos |
| Tunisian Arabic | aeb |