Identify language communities without Wikipedia that may or may not have Incubator
Open, MediumPublic
Actions

Assigned To

Authored By

	srishakatux
	Tue, Apr 16, 12:27 AM

Description

This task involves identifying language communities that:

Do not yet have a Wikipedia.
May or may not be in the Incubator.
Lack quality machine translation services.

One approach to accomplish this is to examine all the languages listed on https://translate.wmcloud.org/ and develop an automated solution (a Python or JavaScript script) to identify languages from this list that do not have a Wikipedia or are not present in the Incubator.

This approach aims to facilitate starting with languages already in the Incubator, making it easier to contact administrators for the outreach plan to gather information about languages for Opus MT machine translation models.

Related Objects
Search...

		Status	Subtype	Assigned	Task
		Open		srishakatux	T362581 Develop and implement outreach plan for OPUS-MT
		Open		srishakatux	T362589 Identify language communities without Wikipedia that may or may not have Incubator

Event Timeline

srishakatux triaged this task as Medium priority. Tue, Apr 16, 12:27 AM

srishakatux created this task.

The Python script below checks whether a Wikipedia exists for each language listed in the dropdown on the right-hand side of https://translate.wmcloud.org/ and prints the languages for which no Wikipedia exists:

import requests

def check_wikipedia_language(language, language_name):
    """Report whether ``https://<language>.wikipedia.org`` looks missing.

    Args:
        language: Language code used as the Wikipedia subdomain.
        language_name: Human-readable name; only used in the warning that
            is printed when the request itself fails.

    Returns:
        ``False`` when the request succeeds with HTTP status 200, ``True``
        for any other status or when the request raises
        ``requests.RequestException``.
    """
    import sys

    url = f"https://{language}.wikipedia.org"
    try:
        response = requests.get(url, timeout=5)
    except requests.RequestException as exc:
        # A timeout or connection error is not proof that the wiki is
        # missing. The result stays True (as before) so callers see the same
        # rows, but the failure is surfaced on stderr so the entry can be
        # re-checked by hand instead of being silently trusted.
        print(
            f"warning: request to {url} ({language_name}) failed: {exc}",
            file=sys.stderr,
        )
        return True

    # NOTE(review): every non-200 status (including rate limiting or a
    # blocked client) is treated as "no Wikipedia" -- confirm this is intended.
    return response.status_code != 200

def main():
    """Print a table of the listed languages that fail the Wikipedia check.

    A header row is printed first; then, in table order, one row is printed
    for every language code for which ``check_wikipedia_language`` returns a
    truthy value.
    """
    # Language code -> display name. Insertion order determines output order.
    language_names = {
        "ace": "Achinese", "acq": "acq", "ady": "Adyghe", "af": "Afrikaans",
        "ajp": "ajp", "ak": "Akan", "sq": "Albanian", "am": "Amharic",
        "ar": "Arabic", "an": "Aragonese", "hy": "Armenian", "frp": "Arpitan",
        "as": "Assamese", "ast": "Asturian", "av": "Avaric", "awa": "Awadhi",
        "ay": "Aymara", "az": "Azerbaijani", "ban": "Balinese", "bm": "Bambara",
        "bn": "Bangla", "bjn": "Banjar", "ba": "Bashkir", "eu": "Basque",
        "bar": "Bavarian", "be": "Belarusian", "bem": "Bemba", "bh": "Bhojpuri",
        "bi": "Bislama", "brx": "Bodo", "bs": "Bosnian", "br": "Breton",
        "bug": "Buginese", "bg": "Bulgarian", "my": "Burmese",
        "zh-yue": "Cantonese", "ca": "Catalan", "ceb": "Cebuano",
        "tzm": "Central Atlas Tamazight", "bcl": "Central Bikol",
        "ckb": "Central Kurdish", "ch": "Chamorro", "ce": "Chechen",
        "chr": "Cherokee", "zh": "Chinese", "cjk": "cjk", "kw": "Cornish",
        "cr": "Cree", "crh": "Crimean Tatar", "hr": "Croatian", "cs": "Czech",
        "da": "Danish", "din": "Dinka", "doi": "Dogri", "nl": "Dutch",
        "dyu": "Dyula", "dz": "Dzongkha", "arz": "Egyptian Arabic",
        "myv": "Erzya", "eo": "Esperanto", "et": "Estonian", "ee": "Ewe",
        "fo": "Faroese", "hif": "Fiji Hindi", "fj": "Fijian", "fi": "Finnish",
        "fon": "Fon", "fr": "French", "fur": "Friulian", "ff": "Fula",
        "gag": "Gagauz", "gl": "Galician", "gan": "Gan", "lg": "Ganda",
        "ka": "Georgian", "de": "German", "gom": "Goan Konkani",
        "gor": "Gorontalo", "el": "Greek", "gn": "Guarani", "gu": "Gujarati",
        "guw": "Gun", "ht": "Haitian Creole", "ha": "Hausa", "he": "Hebrew",
        "hi": "Hindi", "hne": "hne", "hu": "Hungarian", "is": "Icelandic",
        "io": "Ido", "ig": "Igbo", "ilo": "Iloko", "id": "Indonesian",
        "iu": "Inuktitut", "acm": "Iraqi Arabic", "ga": "Irish",
        "it": "Italian", "jam": "Jamaican Creole English", "ja": "Japanese",
        "jv": "Javanese", "kbd": "Kabardian", "kbp": "Kabiye",
        "kea": "Kabuverdianu", "kab": "Kabyle", "kac": "Kachin",
        "kl": "Kalaallisut", "xal": "Kalmyk", "kam": "Kamba", "kn": "Kannada",
        "kr": "Kanuri", "kaa": "Kara-Kalpak", "krc": "Karachay-Balkar",
        "ks": "Kashmiri", "kk": "Kazakh", "km": "Khmer", "ki": "Kikuyu",
        "kmb": "Kimbundu", "rw": "Kinyarwanda", "knc": "knc", "kv": "Komi",
        "koi": "Komi-Permyak", "kg": "Kongo", "ko": "Korean", "ku": "Kurdish",
        "ky": "Kyrgyz", "lo": "Lao", "ltg": "Latgalian", "lv": "Latvian",
        "apc": "Levantine Arabic", "lij": "Ligurian", "li": "Limburgish",
        "ln": "Lingala", "lt": "Lithuanian", "lmo": "Lombard",
        "nds": "Low German", "nds-nl": "Low Saxon", "lua": "Luba-Lulua",
        "luo": "Luo", "lb": "Luxembourgish", "mk": "Macedonian",
        "mad": "Madurese", "mag": "Magahi", "mai": "Maithili",
        "mg": "Malagasy", "ms": "Malay", "ml": "Malayalam", "mt": "Maltese",
        "mni": "Manipuri", "gv": "Manx", "mi": "Māori", "mr": "Marathi",
        "min": "Minangkabau", "zh-min-nan": "Minnan", "mwl": "Mirandese",
        "lus": "Mizo", "mdf": "Moksha", "mnw": "Mon", "mn": "Mongolian",
        "ary": "Moroccan Arabic", "mos": "Mossi", "ars": "Najdi Arabic",
        "nv": "Navajo", "ne": "Nepali", "new": "Newari", "nia": "Nias",
        "se": "Northern Sami", "nso": "Northern Sotho", "no": "Norwegian",
        "nn": "Norwegian Nynorsk", "nus": "Nuer", "ny": "Nyanja",
        "oc": "Occitan", "or": "Odia", "ang": "Old English", "om": "Oromo",
        "os": "Ossetian", "pam": "Pampanga", "pag": "Pangasinan",
        "pap": "Papiamento", "ps": "Pashto", "fa": "Persian", "pl": "Polish",
        "pt": "Portuguese", "pa": "Punjabi", "qu": "Quechua", "ro": "Romanian",
        "rm": "Romansh", "rn": "Rundi", "ru": "Russian", "sm": "Samoan",
        "sg": "Sango", "sa": "Sanskrit", "sat": "Santali", "skr": "Saraiki",
        "sc": "Sardinian", "stq": "Saterland Frisian", "gd": "Scottish Gaelic",
        "sr": "Serbian", "sh": "Serbo-Croatian", "shn": "Shan", "sn": "Shona",
        "scn": "Sicilian", "szl": "Silesian", "simple": "Simple English",
        "sd": "Sindhi", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian",
        "so": "Somali", "azb": "Southern Azerbaijani", "alt": "Southern Altai",
        "st": "Southern Sotho", "es": "Spanish", "srn": "Sranan Tongo",
        "su": "Sundanese", "sw": "Swahili", "ss": "Swati", "sv": "Swedish",
        "tl": "Tagalog", "ty": "Tahitian", "tg": "Tajik", "tly": "Talysh",
        "ta": "Tamil", "taq": "Tamasheq", "tt": "Tatar", "te": "Telugu",
        "tet": "Tetum", "th": "Thai", "bo": "Tibetan", "ti": "Tigrinya",
        "tpi": "Tok Pisin", "to": "Tongan", "ts": "Tsonga", "tn": "Tswana",
        "tcy": "Tulu", "tum": "Tumbuka", "aeb": "Tunisian Arabic",
        "tr": "Turkish", "tk": "Turkmen", "tyv": "Tuvan", "tw": "Twi",
        "uk": "Ukrainian", "umb": "Umbundu", "ur": "Urdu", "ug": "Uyghur",
        "uz": "Uzbek", "ve": "Venda", "vec": "Venetian", "vi": "Vietnamese",
        "wa": "Walloon", "war": "Waray", "cy": "Welsh", "wo": "Wolof",
        "wuu": "Wu Chinese", "xh": "Xhosa", "yi": "Yiddish", "yo": "Yoruba",
        "zu": "Zulu",
    }

    print("| Language name | ISO code |")

    for code, name in language_names.items():
        # Skip every language whose Wikipedia check comes back falsy.
        if not check_wikipedia_language(code, name):
            continue
        print(f"| {name} | {code} |")

# Run the Wikipedia check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

The output of the script in the comment above lists the languages without a Wikipedia, along with their names and ISO codes:

Language name	ISO code
acq	acq
ajp	ajp
Bemba	bem
Bodo	brx
Central Atlas Tamazight	tzm
cjk	cjk
Dogri	doi
Dyula	dyu
hne	hne
Iraqi Arabic	acm
Kabuverdianu	kea
Kachin	kac
Kamba	kam
Kanuri	kr
Kimbundu	kmb
knc	knc
Levantine Arabic	apc
Luba-Lulua	lua
Luo	luo
Magahi	mag
Mizo	lus
Mossi	mos
Najdi Arabic	ars
Nuer	nus
Tamasheq	taq
Tunisian Arabic	aeb
Umbundu	umb

The Python script below takes the list above (languages without a Wikipedia) and checks whether each of them is present in the Incubator:

import requests

# Siteinfo query whose "languages" list is used as the lookup table.
_INCUBATOR_SITEINFO_URL = (
    "https://incubator.wikimedia.org/w/api.php"
    "?action=query&format=json&meta=siteinfo&siprop=languages&formatversion=2"
)

# Filled on first use so the same payload is not downloaded once per language.
_language_codes_cache = None


def _fetch_language_codes():
    """Return the set of language codes from the siteinfo query, fetched once."""
    global _language_codes_cache
    if _language_codes_cache is None:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(_INCUBATOR_SITEINFO_URL, timeout=10)
        # Fail loudly on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        entries = response.json().get("query", {}).get("languages", [])
        _language_codes_cache = frozenset(entry["code"] for entry in entries)
    return _language_codes_cache


def check_language_existence(language_code):
    """Return True if ``language_code`` appears in the Incubator siteinfo list.

    The list is downloaded on the first call and reused afterwards.

    NOTE(review): ``siprop=languages`` looks like it lists every language
    code known to MediaWiki rather than only languages that have an
    Incubator test wiki -- confirm this is the intended signal.

    Args:
        language_code: Language code to look up.

    Returns:
        ``True`` when an entry with a matching ``code`` exists, else ``False``.

    Raises:
        requests.RequestException: If the API request fails, times out, or
            returns an HTTP error status.
    """
    return language_code in _fetch_language_codes()

# Languages reported as lacking a Wikipedia: display name -> language code.
# Insertion order determines the order of the printed rows.
language_data = {
    "acq": "acq", "ajp": "ajp", "Bemba": "bem", "Bodo": "brx",
    "Central Atlas Tamazight": "tzm", "cjk": "cjk", "Dogri": "doi",
    "Dyula": "dyu", "hne": "hne", "Iraqi Arabic": "acm",
    "Kabuverdianu": "kea", "Kachin": "kac", "Kamba": "kam", "Kanuri": "kr",
    "Kimbundu": "kmb", "knc": "knc", "Levantine Arabic": "apc",
    "Luba-Lulua": "lua", "Luo": "luo", "Magahi": "mag", "Mizo": "lus",
    "Mossi": "mos", "Najdi Arabic": "ars", "Nuer": "nus", "Tamasheq": "taq",
    "Tunisian Arabic": "aeb", "Umbundu": "umb",
}

print("| Language name | ISO code |")

for language_name, language_code in language_data.items():
    # Only languages that pass the Incubator lookup get a row.
    if not check_language_existence(language_code):
        continue
    print(f"| {language_name} | {language_code} |")

The output of the script in the comment above lists the 8 languages that have no Wikipedia but are present in the Incubator, along with their names and ISO codes:

Language name	ISO code
Central Atlas Tamazight	tzm
Iraqi Arabic	acm
Kabuverdianu	kea
Kanuri	kr
Magahi	mag
Mizo	lus
Mossi	mos
Tunisian Arabic	aeb

Identify language communities without Wikipedia that may or may not have IncubatorOpen, MediumPublicActions

Description

Related ObjectsSearch...

Event Timeline

Identify language communities without Wikipedia that may or may not have Incubator
Open, MediumPublic
Actions

Related Objects
Search...