Page MenuHomePhabricator

Load language data
Closed, ResolvedPublic

Description

Load the table with language data from CSV files.

Event Timeline

Loaded

SELECT  * 
  FROM ntsako.brief_projects_edited_metrics 
WHERE year=2021

SELECT * 
  FROM ntsako.official_language_metrics

SELECT *
  FROM ntsako.unesco_endangered_lang_metrics
WHERE year=2021
SELECT  * 
  FROM ntsako.brief_projects_edited_metrics 
WHERE year=2021


SELECT *
  FROM ntsako.unesco_endangered_lang_metrics
WHERE year=2021

Take the above to production.

Note that the active projects are those which are edited by at least 1% and 5 or more editors in the region each month on average during

query

WITH
    monthly_avg AS (
        SELECT 
            country_code,
            ROUND(AVG(distinct_editors)) AS average_monthly_acitve_editors,
            wiki_db
        FROM 
            wmf.geoeditors_monthly
        WHERE 
            YEAR(month) = 2021
        GROUP BY
            country_code,
            wiki_db),
    
    monthly_sum AS (
        SELECT
            country_code,
            SUM(average_monthly_acitve_editors) AS total_editors
        FROM
            monthly_avg
        GROUP BY
            country_code),
    
    projects_edited AS (
        SELECT
            avg.country_code,
            average_monthly_acitve_editors,
            ROUND(average_monthly_acitve_editors / total_editors * 100, 2) AS proportion,
            wiki_db
        FROM
            monthly_avg avg
        LEFT JOIN
            monthly_sum sum
            ON avg.country_code = sum.country_code),
    
    brief_projects_edited AS (
        SELECT
            country_code,
            cdc.name AS country_name,
            average_monthly_acitve_editors,
            proportion,
            wiki_db,
            english_name AS project_label,
            language_name AS language
        FROM
            projects_edited pe
        LEFT JOIN
            canonical_data.wikis cdw
            ON pe.wiki_db = cdw.database_code
        LEFT JOIN
            canonical_data.countries cdc
            ON pe.country_code = cdc.iso_code
        WHERE
            average_monthly_acitve_editors >= 5 AND
            proportion >= 1)

SELECT *
FROM brief_projects_edited

the last WHERE statement can be removed for a non-filtered version

Table loaded with the below query so as to use our country meta data table instead of canonical countries table.

WITH
   all_countries AS (
     SELECT DISTINCT *
     FROM
     (
      SELECT '--' AS country_code, 'Unknown' AS name
          UNION ALL
     SELECT country_code_iso_2 AS country_code, canonical_country_name
       FROM ntsako.country_meta_data
     ) AS countries
   ),
    monthly_avg AS (
        SELECT country_code,
               ROUND(AVG(distinct_editors)) AS average_monthly_acitve_editors,
               wiki_db
        FROM wmf.geoeditors_monthly
       WHERE month like  '2021-%'
       GROUP BY country_code, wiki_db
     ),
    monthly_sum AS (
        SELECT country_code,
               SUM(average_monthly_acitve_editors) AS total_editors
          FROM monthly_avg
         GROUP BY country_code
    ),
     projects_edited AS (
        SELECT monthly_avg.country_code,
               monthly_avg.average_monthly_acitve_editors,
               ROUND(monthly_avg.average_monthly_acitve_editors / monthly_sum.total_editors * 100, 2) AS proportion,
               monthly_avg.wiki_db
          FROM monthly_avg
          LEFT JOIN monthly_sum ON (monthly_avg.country_code = monthly_sum.country_code)
        ),
      brief_projects_edited AS (
        SELECT projects_edited.country_code,
               countries.name                                 AS country_name,
               projects_edited.average_monthly_acitve_editors,
               projects_edited.proportion,
               projects_edited.wiki_db,
               wikis.english_name                             AS project_label,
               wikis.language_name                            AS language
          FROM projects_edited
          LEFT JOIN canonical_data.wikis wikis ON (projects_edited.wiki_db = wikis.database_code)
          LEFT JOIN all_countries countries ON (projects_edited.country_code = countries.country_code)
     )
SELECT country_code,
       country_name,
       average_monthly_acitve_editors,
       proportion,
       wiki_db,
       project_label,
       language
  FROM brief_projects_edited

@ntsako That's a good idea. Thanks for the improvements.