# Migrate Airflow (2.5 + PG) in the test cluster (an-test-client1001)

# 1/ Put the latest version of the airflow deb package on apt.wikimedia.org
# https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags/-/packages

# 2/ Pause all running DAGs
# Use the Airflow UI.
# Then wait for all running tasks to finish.

# 3/ Shut down the Airflow service
# This should stop the scheduler & the webserver.
# The goal is to freeze the DB.

# 4/ Update the Airflow debian package
apt update
apt install airflow

# 5/ Check the install
source /usr/lib/airflow/etc/profile.d/conda.sh
/usr/lib/airflow/bin/airflow version
# Should return 2.5.0

# 6/ Create an alternative conda env with Airflow 2.1.4
source ~/proxy.sh
source /usr/lib/airflow/etc/profile.d/conda.sh
conda create --name airflow-2.1.4
conda activate ~/.conda/envs/airflow-2.1.4
conda install pip
pip install apache-airflow==2.1.4 psycopg2-binary pyspark==3.1.2 SQLAlchemy~=1.4
airflow version
# or:
~/.conda/envs/airflow-2.1.4/bin/airflow version
conda list | grep "airflow\|pyspark\|chemy"

# 7/ Initialize the DB in Postgres with Airflow version 2.1.4
# Check your Airflow version & initialize your Airflow home at the same time:
AIRFLOW_HOME=~/airflow-2.1.4 ~/.conda/envs/airflow-2.1.4/bin/airflow version
# Edit the sql_alchemy_conn connection string:
vim ~/airflow-2.1.4/airflow.cfg
# sql_alchemy_conn = postgresql://airflow_data_engineering_dev:REPLACEME@an-db1001.eqiad.wmnet:5432/airflow_data_engineering_dev
AIRFLOW_HOME=~/airflow-2.1.4 ~/.conda/envs/airflow-2.1.4/bin/airflow db check
AIRFLOW_HOME=~/airflow-2.1.4 ~/.conda/envs/airflow-2.1.4/bin/airflow db init

# 8/ Import data from MariaDB
# Get the JDBC drivers for MariaDB & PostgreSQL:
wget https://repo1.maven.org/maven2/org/mariadb/jdbc/mariadb-java-client/3.0.8/mariadb-java-client-3.0.8.jar
wget https://repo1.maven.org/maven2/org/postgresql/postgresql/42.5.0/postgresql-42.5.0.jar
# Launch the Spark console:
~/.conda/envs/airflow-2.1.4/bin/pyspark --jars=mariadb-java-client-3.0.8.jar,postgresql-42.5.0.jar
# Paste the Python migration script progressively into the pyspark console (set the correct passwords first).
# A hedged sketch of such a script is shown below.
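# The actual migration script is not reproduced on this page; the following is only a
# minimal sketch of what a table-by-table JDBC copy could look like in the pyspark
# console. The MariaDB host/database, the credentials, and the table list are
# placeholders to adapt; only the Postgres target matches the sql_alchemy_conn above.

# `spark` is the SparkSession already defined by the pyspark console.
mariadb_url = "jdbc:mariadb://MARIADB_HOST:3306/airflow_db"  # placeholder host & database
postgres_url = "jdbc:postgresql://an-db1001.eqiad.wmnet:5432/airflow_data_engineering_dev"
tables = ["dag_run", "task_instance", "variable", "log"]  # example subset, not the full list

for table in tables:
    # Read the table from MariaDB through JDBC.
    df = (
        spark.read.format("jdbc")
        .option("url", mariadb_url)
        .option("driver", "org.mariadb.jdbc.Driver")
        .option("dbtable", table)
        .option("user", "REPLACEME")
        .option("password", "REPLACEME")
        .load()
    )
    # Append the rows into the matching Postgres table created by `airflow db init`.
    (
        df.write.format("jdbc")
        .option("url", postgres_url)
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", table)
        .option("user", "airflow_data_engineering_dev")
        .option("password", "REPLACEME")
        .mode("append")
        .save()
    )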
# 9/ Migrate the Airflow Postgres DB to Airflow 2.5.0
# In another terminal, to avoid conda confusion.
source /usr/lib/airflow/etc/profile.d/conda.sh
conda activate /usr/lib/airflow
# Set up an Airflow home:
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow version
# Edit airflow.cfg:
vim ~/airflow-2.5.0/airflow.cfg
# sql_alchemy_conn = postgresql://airflow_data_engineering_dev:REPLACEME@an-db1001.eqiad.wmnet:5432/airflow_data_engineering_dev
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow db check
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow db upgrade

# 10/ Clean up a leftover table after the migration
# With psql or your favorite GUI (DataGrip), connect to the PostgreSQL DB.
# SQL: drop table _airflow_moved__2_2__dangling__task_instance;

# 11/ Manually set all DAGs as paused in Postgres before launching the scheduler
# SQL: update dag set is_paused = TRUE where is_active is TRUE and owners <> 'airflow';
# Note: this step can probably be skipped with this new process.

# 12/ Deploy airflow-dags/analytics_test
# - Merge first: https://gitlab.wikimedia.org/repos/data-engineering/airflow-dags/-/merge_requests/200
# - Then scap deploy.

# 18/ Deploy the puppet configuration change
# - And wait for it to propagate.

# 19/ Restart the Airflow service
# If puppet didn't already do it.
# This brings back the scheduler & the webserver.

# 20/ Test
# - Check the configuration output in the UI.
# - Test aqs_hourly (a hedged CLI sketch is at the end of this page):
#   Set the start date {"start_date":"2022-05-08"} in Airflow variables.
#   Remove DAG runs that are not in a terminal state (success/failed).
#   Check that it works & look at the logs.
# - Launch analytics_test/dags/custom_operators_tryout_dag.py
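# Optional: the aqs_hourly test can also be driven from the CLI instead of the UI.
# Hedged sketch only: the variable key "aqs_hourly_config" is a placeholder (the real
# key read by the DAG is not given above), and AIRFLOW_HOME must point at the config
# the running service uses; the step-9 home is shown here since it targets the same
# Postgres DB.
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow variables set aqs_hourly_config '{"start_date":"2022-05-08"}'
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow dags unpause aqs_hourly
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow dags trigger aqs_hourly
AIRFLOW_HOME=~/airflow-2.5.0 /usr/lib/airflow/bin/airflow dags list-runs -d aqs_hourly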