#!/bin/bash

PATH_TO_RDF_SPARK_TOOL_JAR=/path/to/rdf-spark-tools.jar
SNAPSHOT=20230116
OUTPUT=hdfs:///user/pfischer/test_rdf_spark_tools/$SNAPSHOT/rev_map.csv
SPARK3_SUBMIT=spark3-submit

# The production output lives at
# hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/$SNAPSHOT/rev_map.csv
# so the test output can be compared against it to check that the data is similar.
# E.g.:
#   hdfs dfs -du -s -h hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/20230116/rev_map.csv
#   Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
#   873.0 M  hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/20230116/rev_map.csv
#
#   hdfs dfs -ls hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/20230116/rev_map.csv | wc -l
#   Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
#   102

# Submit the revision-map generator to YARN in cluster mode.
$SPARK3_SUBMIT \
  --master yarn \
  --conf spark.dynamicAllocation.maxExecutors=25 \
  --conf spark.yarn.maxAppAttempts=1 \
  --executor-cores 8 \
  --executor-memory 16g \
  --driver-memory 2g \
  --name "SPARK3 TEST: [Search Airflow Job] Import Wikidata Ttl: Gen Rev Map" \
  --class org.wikidata.query.rdf.spark.transform.structureddata.dumps.EntityRevisionMapGenerator \
  --queue default \
  --deploy-mode cluster \
  $PATH_TO_RDF_SPARK_TOOL_JAR \
  --input-table discovery.wikibase_rdf/date=$SNAPSHOT/wiki=wikidata \
  --output-path $OUTPUT
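
# Optional sanity check after the job finishes, along the lines of the example
# numbers in the comments above. This is a sketch, not part of the original job:
# PROD_OUTPUT is a helper variable introduced here for the comparison.
PROD_OUTPUT=hdfs://analytics-hadoop/wmf/data/discovery/wdqs/entity_revision_map/$SNAPSHOT/rev_map.csv

# Compare total sizes (production was ~873 M for snapshot 20230116).
hdfs dfs -du -s -h "$OUTPUT" "$PROD_OUTPUT"

# Compare the number of listed entries (production listing gave 102 lines
# for 20230116; the counts should be in the same ballpark).
hdfs dfs -ls "$OUTPUT" | wc -l
hdfs dfs -ls "$PROD_OUTPUT" | wc -l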