Deployment needs to be synchronized with updates in the sqoop script and the grouped-wikis files (https://gerrit.wikimedia.org/r/#/c/341586/)
These are the commands we use when sqooping manually for a given month YYYY-MM:
# Manual sqoop run for a given month (replace YYYY-MM / YYYYMM01000000 placeholders).
# Make the refinery python package importable by the scripts below.
export PYTHONPATH=$PYTHONPATH:/srv/deployment/analytics/refinery/python

# For PROD:

# This command does NOT check if there's an already running one
sudo -u hdfs /srv/deployment/analytics/refinery/bin/download-project-namespace-map \
  -x /wmf/data/raw/mediawiki/project_namespace_map \
  -i prod \
  -V YYYY-MM

# This command normally checks if there's an already running one
sudo -u hdfs python3 /srv/deployment/analytics/refinery/bin/sqoop-mediawiki-tables \
  --jdbc-host analytics-store.eqiad.wmnet \
  --output-dir /wmf/data/raw/mediawiki/tables \
  --wiki-file "/mnt/hdfs/wmf/refinery/current/static_data/mediawiki/grouped_wikis/prod_grouped_wikis.csv" \
  --timestamp YYYYMM01000000 \
  --user research \
  --infra prod \
  --version YYYY-MM \
  --password-file /user/hdfs/mysql-analytics-research-client-pw.txt

# For LABS:

# This command does NOT check if there's an already running one
sudo -u hdfs /srv/deployment/analytics/refinery/bin/download-project-namespace-map \
  -x /wmf/data/raw/mediawiki/project_namespace_map \
  -i labs \
  -V YYYY-MM

# This command normally checks if there's an already running one
# NOTE(review): --user and --password-file values are still TBC for labs — fill in before running.
sudo -u hdfs python3 /srv/deployment/analytics/refinery/bin/sqoop-mediawiki-tables \
  --jdbc-host labsdb-analytics.eqiad.wmnet \
  --output-dir /wmf/data/raw/mediawiki/tables \
  --wiki-file "/mnt/hdfs/wmf/refinery/current/static_data/mediawiki/grouped_wikis/labs_grouped_wikis.csv" \
  --timestamp YYYYMM01000000 \
  --user TBC \
  --infra labs \
  --version YYYY-MM \
  --password-file TBC