Paste P43199

20230120_airflow_migration_test_cluster.sh

Authored by Antoine_Quhen on Jan 20 2023, 6:09 PM.
# Migrate Airflow (2.5 + PG) in test cluster (an-test-client1001)
# 1/ Put the last version of the airflow deb package on apt.wikimedia.org
# https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags/-/packages
# 2/ Set all running DAGs to "paused"
# With the Airflow UI (or the CLI, sketched below)
# And wait for all running tasks to be finished
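# Hedged sketch: the same can be done with the Airflow CLI (the dag id is a placeholder):
/usr/lib/airflow/bin/airflow dags pause REPLACEME_DAG_ID
# and list still-running runs per dag:
/usr/lib/airflow/bin/airflow dags list-runs -d REPLACEME_DAG_ID --state running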
# 3/ Shut down the Airflow service
# Should stop the scheduler & the webserver.
# The goal is to freeze the DB.
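# Hedged sketch, assuming standard systemd units (actual unit names on this host may differ):
sudo systemctl stop airflow-scheduler airflow-webserver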
# 4/ Update the Airflow debian package
sudo apt update
sudo apt install --only-upgrade airflow
# 5/ Check the install
source /usr/lib/airflow/etc/profile.d/conda.sh
/usr/lib/airflow/bin/airflow version
# Should return 2.5.0
# 6/ Create an alternative conda env with Airflow 2.1.4
source ~/proxy.sh
source /usr/lib/airflow/etc/profile.d/conda.sh
conda create --name airflow-2.1.4
conda activate ~/.conda/envs/airflow-2.1.4
conda install pip
pip install apache-airflow==2.1.4 psycopg2-binary pyspark==3.1.2 SQLAlchemy~=1.4
airflow version
# or ~/.conda/envs/airflow-2.1.4/bin/airflow version
conda list | grep "airflow\|pyspark\|chemy"
# 7/ Init the DB in postgres with Airflow version 2.1.4
# check your Airflow version & initialize your Airflow home at the same time
AIRFLOW_HOME=~/airflow-2.1.4 ~/.conda/envs/airflow-2.1.4/bin/airflow version
# edit the connection sql_alchemy_conn
vim ~/airflow-2.1.4/airflow.cfg
# sql_alchemy_conn = postgresql://airflow_data_engineering_dev:REPLACEME@an-db1001.eqiad.wmnet:5432/airflow_data_engineering_dev
AIRFLOW_HOME=~/airflow-2.1.4 ~/.conda/envs/airflow-2.1.4/bin/airflow db check
AIRFLOW_HOME=~/airflow-2.1.4 ~/.conda/envs/airflow-2.1.4/bin/airflow db init
# 8/ Import data from MariaDB
# get the JDBC drivers for MariaDB & PG
wget https://repo1.maven.org/maven2/org/mariadb/jdbc/mariadb-java-client/3.0.8/mariadb-java-client-3.0.8.jar
wget https://repo1.maven.org/maven2/org/postgresql/postgresql/42.5.0/postgresql-42.5.0.jar
# Launch the Spark console
~/.conda/envs/airflow-2.1.4/bin/pyspark --jars=mariadb-java-client-3.0.8.jar,postgresql-42.5.0.jar
# Paste the Python migration script step by step into the pyspark console (set the correct password first)
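# Hedged sketch of such a copy script: everything below (source host, table list,
# credentials) is illustrative, not the actual script. As an alternative to pasting,
# it can be fed to the console non-interactively via a heredoc:
~/.conda/envs/airflow-2.1.4/bin/pyspark --jars=mariadb-java-client-3.0.8.jar,postgresql-42.5.0.jar <<'EOF'
from pyspark.sql import SparkSession

# Create the session explicitly so the script also works when piped (not pasted).
spark = SparkSession.builder.getOrCreate()

mariadb_url = "jdbc:mariadb://REPLACEME_HOST:3306/airflow"  # hypothetical source URL
pg_url = "jdbc:postgresql://an-db1001.eqiad.wmnet:5432/airflow_data_engineering_dev"

# Hypothetical subset; the real migration copies every Airflow table.
for table in ["dag", "dag_run", "task_instance"]:
    df = (spark.read.format("jdbc")
          .option("url", mariadb_url)
          .option("dbtable", table)
          .option("user", "REPLACEME")
          .option("password", "REPLACEME")
          .option("driver", "org.mariadb.jdbc.Driver")
          .load())
    (df.write.format("jdbc")
       .option("url", pg_url)
       .option("dbtable", table)
       .option("user", "airflow_data_engineering_dev")
       .option("password", "REPLACEME")
       .option("driver", "org.postgresql.Driver")
       .mode("append")
       .save())
# Note: Postgres sequences may need resetting (setval) after a raw copy like this.
EOF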
# 9/ Migrate the Airflow PostgresDB to Airflow 2.5.0
# In another terminal, to avoid conda confusion.
source /usr/lib/airflow/etc/profile.d/conda.sh
conda activate /usr/lib/airflow
# Set up an Airflow home
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow version
# edit airflow.cfg
vim ~/airflow-2.5.0/airflow.cfg
# sql_alchemy_conn = postgresql://airflow_data_engineering_dev:REPLACEME@an-db1001.eqiad.wmnet:5432/airflow_data_engineering_dev
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow db check
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow db upgrade
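# Optional sanity check that all migrations were applied (hedged; available in Airflow 2.x):
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow db check-migrations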
# 10/ Clean up a leftover table after the migration.
# With psql or your favorite GUI (DataGrip), connect to the PostgreSQL DB.
# SQL: drop table _airflow_moved__2_2__dangling__task_instance;
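# e.g. with psql (connection details from airflow.cfg above; the password is prompted):
psql -h an-db1001.eqiad.wmnet -U airflow_data_engineering_dev -d airflow_data_engineering_dev \
    -c "drop table _airflow_moved__2_2__dangling__task_instance;"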
# 11/ Set all DAGs to paused manually in Postgres before launching the scheduler
# SQL: update dag set is_paused = TRUE where is_active is TRUE and owners <> 'airflow';
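# Same psql pattern as above:
psql -h an-db1001.eqiad.wmnet -U airflow_data_engineering_dev -d airflow_data_engineering_dev \
    -c "update dag set is_paused = TRUE where is_active is TRUE and owners <> 'airflow';"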
# Note: this step can probably be skipped with this new process.
# 12/ Deploy the airflow-dags/analytics_test
# - merge first: https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags/-/merge_requests/200
# - then scap deploy
# 13/ Deploy the puppet configuration change
# - and wait for propagation
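# e.g., force a puppet run on the target host instead of waiting (WMF helper; hedged):
sudo run-puppet-agent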
# 14/ Restart the airflow service
# If puppet didn't do it.
# To bring back the scheduler & the webserver.
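# Hedged sketch, mirroring the stop step (same assumed unit names):
sudo systemctl start airflow-scheduler airflow-webserver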
# 15/ Test
# - check the configuration output on the UI
# - test aqs_hourly
# Set the start date {"start_date":"2022-05-08"} in Airflow variables (CLI sketch after this list)
# Remove DAG runs that are not in a final state (success/failed)
# Check that it works & look at the logs
# - launch analytics_test/dags/custom_operators_tryout_dag.py
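# Hedged sketch for setting the start-date variable (the variable name is hypothetical):
/usr/lib/airflow/bin/airflow variables set REPLACEME_VARIABLE '{"start_date":"2022-05-08"}'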