|
#!/bin/sh

# example shell script to run a cc-pyspark job on a Hadoop cluster (Spark on YARN)

SCRIPT="$1"
WAREHOUSE="$2"

if [ -z "$SCRIPT" ] || [ -z "$WAREHOUSE" ]; then
    echo "Usage: $0 <script> <warehouse> <args>..."
    echo "  Run a cc-pyspark job on a Spark/Hadoop cluster"
    echo
    echo "Arguments:"
    echo "  <script>      cc-pyspark job implementation"
    echo "  <warehouse>   Spark SQL warehouse directory"
    echo "  <args>...     remaining arguments are passed to the job"
    echo
    echo "Example:"
    echo "  $0 server_count.py hdfs:///user/max/counts \\"
    echo "       wat_sample.paths servers"
    echo
    echo "Note: don't forget to adapt the number of executors,"
    echo "      input/output partitions, the memory requirements"
    echo "      and other parameters to your needs!"
    echo "      Some parameters can be set via environment variables."
    exit 1
fi

# strip SCRIPT and WAREHOUSE from argument list
shift 2

SPARK_ON_YARN="--master yarn"
SPARK_HADOOP_OPTS=""
SPARK_EXTRA_OPTS=""

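# Note: for a quick test without a YARN cluster, the master could be pointed
# at a local Spark instead, e.g. SPARK_ON_YARN="--master local[4]"
# (example value, adapt the number of local worker threads as needed).
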
# defines SPARK_HOME, SPARK_HADOOP_OPTS and HADOOP_CONF_DIR
. "$HOME/workspace/spark/spark_env.sh"

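# A minimal spark_env.sh might look like the sketch below; the paths are
# placeholders and must be adapted to the local Spark/Hadoop installation:
#
#   export SPARK_HOME=/opt/spark
#   export HADOOP_CONF_DIR=/etc/hadoop/conf
#   SPARK_HADOOP_OPTS=""   # extra --conf options, e.g. for S3 credentials
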
NUM_EXECUTORS=${NUM_EXECUTORS:-1}
EXECUTOR_MEM=${EXECUTOR_MEM:-4g}
EXECUTOR_CORES=${EXECUTOR_CORES:-2}

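# The defaults above are overridden by environment variables of the same name,
# e.g. (illustrative values, <this_script> stands for this file):
#
#   NUM_EXECUTORS=8 EXECUTOR_MEM=8g EXECUTOR_CORES=4 \
#     ./<this_script> server_count.py hdfs:///user/max/counts wat_sample.paths servers
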
# access data via S3
INPUT_BASE_URL="s3://commoncrawl/"

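# Depending on where the cluster runs, the data could instead be fetched over
# HTTP(S) from the Common Crawl servers, e.g.
#   INPUT_BASE_URL="https://data.commoncrawl.org/"
# (check that the cc-pyspark version in use supports HTTP(S) input).
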
# temporary directory
# - must exist on task/compute nodes for buffering data
# - should provide several GBs of free space to temporarily hold
#   the downloaded data (WARC, WAT, WET files)
TMPDIR=/data/0/tmp

export PYSPARK_PYTHON="python" # or "python3"

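# If the jobs need additional Python packages on the executors, PYSPARK_PYTHON
# can point to the interpreter of a virtual environment available on all
# cluster nodes (hypothetical path):
#   export PYSPARK_PYTHON="/opt/venv/cc-pyspark/bin/python"
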
# Python dependencies (for simplicity, include all Python files: cc-pyspark/*.py)
PYFILES=$(ls sparkcc.py sparkcc_fastwarc.py *.py | sort -u | tr '\n' ',')
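# PYFILES now holds a comma-separated list of the .py files in the working
# directory, e.g. "server_count.py,sparkcc.py,sparkcc_fastwarc.py,..." (names
# are examples); the script must therefore be run from the cc-pyspark directory.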

set -xe

$SPARK_HOME/bin/spark-submit \
    $SPARK_ON_YARN \
    $SPARK_HADOOP_OPTS \
    $SPARK_EXTRA_OPTS \
    --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
    --conf spark.task.maxFailures=5 \
    --conf spark.executor.memory=$EXECUTOR_MEM \
    --conf spark.driver.memory=3g \
    --conf spark.core.connection.ack.wait.timeout=600s \
    --conf spark.network.timeout=300s \
    --conf spark.shuffle.io.maxRetries=50 \
    --conf spark.shuffle.io.retryWait=600s \
    --conf spark.locality.wait=1s \
    --conf spark.io.compression.codec=zstd \
    --conf spark.checkpoint.compress=true \
    --conf spark.executorEnv.LD_LIBRARY_PATH=/usr/lib/hadoop/lib/native \
    --num-executors $NUM_EXECUTORS \
    --executor-cores $EXECUTOR_CORES \
    --executor-memory $EXECUTOR_MEM \
    --conf spark.sql.warehouse.dir=$WAREHOUSE \
    --conf spark.sql.parquet.outputTimestampType=TIMESTAMP_MILLIS \
    --py-files $PYFILES \
    $SCRIPT \
    --input_base_url $INPUT_BASE_URL \
    --local_temp_dir $TMPDIR \
    "$@"