#!/bin/bash

PATH_TO_RDF_SPARK_TOOL_JAR=/path/to/rdf-spark-tools.jar
SNAPSHOT=20230116
OUTPUT=hdfs:///user/pfischer/test_rdf_spark_tools/$SNAPSHOT/rev_map.csv
SPARK3_SUBMIT=spark3-submit

# The production output lives at
# hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/$SNAPSHOT/rev_map.csv
# so the test output can be compared against it to check that the data is similar.
# E.g.:
#   hdfs dfs -du -s -h hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/20230116/rev_map.csv
#   Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
#   873.0 M  hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/20230116/rev_map.csv
#
#   hdfs dfs -ls hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/20230116/rev_map.csv | wc -l
#   Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
#   102

# Submit the revision-map generator to YARN in cluster mode.
$SPARK3_SUBMIT \
  --master yarn \
  --conf spark.dynamicAllocation.maxExecutors=25 \
  --conf spark.yarn.maxAppAttempts=1 \
  --executor-cores 8 \
  --executor-memory 16g \
  --driver-memory 2g \
  --name "SPARK3 TEST: [Search Airflow Job] Import Wikidata Ttl: Gen Rev Map" \
  --class org.wikidata.query.rdf.spark.transform.structureddata.dumps.EntityRevisionMapGenerator \
  --queue default \
  --deploy-mode cluster \
  $PATH_TO_RDF_SPARK_TOOL_JAR \
  --input-table discovery.wikibase_rdf/date=$SNAPSHOT/wiki=wikidata \
  --output-path $OUTPUT
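
# Optional sanity check after the job finishes, along the lines of the example
# numbers in the comments above. This is a sketch, not part of the original job:
# PROD_OUTPUT is a helper variable introduced here for the comparison.
PROD_OUTPUT=hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/$SNAPSHOT/rev_map.csv

# Compare total sizes (production was ~873 M for snapshot 20230116).
hdfs dfs -du -s -h "$OUTPUT" "$PROD_OUTPUT"

# Compare the number of listed entries (production listing gave 102 lines
# for 20230116; the counts should be in the same ballpark).
hdfs dfs -ls "$OUTPUT" | wc -l
hdfs dfs -ls "$PROD_OUTPUT" | wc -l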