
Commit 7c6e27d: added progs for chapter 5
Parent: 7af7324
6 files changed: 214 additions & 0 deletions
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
Each record has the following format:

<customer_id><,><year><,><transaction_id><,><transaction_value>
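
Not part of this commit, but as a minimal sketch of how a file in this format could be read with an explicit schema (instead of the inferSchema option used in the programs below). The column names and the "customers.txt" path come from this commit; everything else is illustrative:

from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField,
                               StringType, IntegerType, DoubleType)

# schema matching <customer_id>,<year>,<transaction_id>,<transaction_value>
schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("year", IntegerType(), True),
    StructField("transaction_id", StringType(), True),
    StructField("transaction_value", DoubleType(), True)])

spark = SparkSession.builder.getOrCreate()
# customers.txt is the sample data file added in this commit
df = spark.read.schema(schema).csv("customers.txt")
df.printSchema()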

code/chap05/python/customers.txt

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
c1,2019,T0011,20.67
c1,2019,T0012,12.34
c1,2019,T0013,44.30
c1,2018,T0001,20.67
c1,2018,T0002,12.34
c1,2018,T0003,44.30
c2,2019,T0017,744.30
c2,2019,T0018,820.67
c2,2018,T0022,182.34
c2,2018,T0033,494.30
Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
export INPUT_PATH="customers.txt"
export OUTPUT_PATH="/tmp/partition_demo"
export PROG="partition_data_by_customer_and_year.py"
export SPARK_HOME=/book/spark-3.2.0
$SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH $OUTPUT_PATH
input_path= customers.txt
output_path= /tmp/partition_demo

df:
+-----------+----+--------------+-----------------+
|customer_id|year|transaction_id|transaction_value|
+-----------+----+--------------+-----------------+
|c1         |2019|T0011         |20.67            |
|c1         |2019|T0012         |12.34            |
|c1         |2019|T0013         |44.3             |
|c1         |2018|T0001         |20.67            |
|c1         |2018|T0002         |12.34            |
|c1         |2018|T0003         |44.3             |
|c2         |2019|T0017         |744.3            |
|c2         |2019|T0018         |820.67           |
|c2         |2018|T0022         |182.34           |
|c2         |2018|T0033         |494.3            |
+-----------+----+--------------+-----------------+

df.schema:
root
|-- customer_id: string (nullable = true)
|-- year: integer (nullable = true)
|-- transaction_id: string (nullable = true)
|-- transaction_value: double (nullable = true)

df2:
+--------------+-----------------+-----------+----+
|transaction_id|transaction_value|customer_id|year|
+--------------+-----------------+-----------+----+
|T0011         |20.67            |c1         |2019|
|T0012         |12.34            |c1         |2019|
|T0013         |44.3             |c1         |2019|
|T0001         |20.67            |c1         |2018|
|T0002         |12.34            |c1         |2018|
|T0003         |44.3             |c1         |2018|
|T0017         |744.3            |c2         |2019|
|T0018         |820.67           |c2         |2019|
|T0022         |182.34           |c2         |2018|
|T0033         |494.3            |c2         |2018|
+--------------+-----------------+-----------+----+

df2.schema:
root
|-- transaction_id: string (nullable = true)
|-- transaction_value: double (nullable = true)
|-- customer_id: string (nullable = true)
|-- year: integer (nullable = true)


$ ls -1R /tmp/partition_demo/
_SUCCESS
customer_id=c1
customer_id=c2

/tmp/partition_demo//customer_id=c1:
year=2018
year=2019

/tmp/partition_demo//customer_id=c1/year=2018:
part-00000-8905097e-a6d3-4cb7-8b40-879073ec51bc.c000.snappy.parquet

/tmp/partition_demo//customer_id=c1/year=2019:
part-00000-8905097e-a6d3-4cb7-8b40-879073ec51bc.c000.snappy.parquet

/tmp/partition_demo//customer_id=c2:
year=2018
year=2019

/tmp/partition_demo//customer_id=c2/year=2018:
part-00000-8905097e-a6d3-4cb7-8b40-879073ec51bc.c000.snappy.parquet

/tmp/partition_demo//customer_id=c2/year=2019:
part-00000-8905097e-a6d3-4cb7-8b40-879073ec51bc.c000.snappy.parquet
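
The customer_id=.../year=... directories above are how Spark encodes the partition columns on disk. When the output directory is read back, those columns are rebuilt from the directory names, and a filter on them only scans the matching subdirectories (partition pruning). A minimal sketch, assuming the same /tmp/partition_demo output path as above:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
# partition columns (customer_id, year) are recovered from the directory names
df = spark.read.parquet("/tmp/partition_demo")
# this filter should only touch the customer_id=c1/year=2019 directory
df.where((col("customer_id") == "c1") & (col("year") == 2019)).show()
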
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
#!/usr/bin/env python
#-----------------------------------------------------
# 1. Read customers.txt
# 2. Create a DataFrame with 4 columns:
#    { <customer_id>,
#      <year>,
#      <transaction_id>,
#      <transaction_value> }
# 3. Partition data by (<customer_id>, <year>)
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------
from __future__ import print_function
import sys
from pyspark.sql import SparkSession

# define input path
input_path = sys.argv[1]
print("input_path=", input_path)

# define output path for partitioned data
output_path = sys.argv[2]
print("output_path=", output_path)

# create a SparkSession object
spark = SparkSession.builder.getOrCreate()


# create a DataFrame; note that toDF() returns a
# new DataFrame with the specified column names
# columns = ('customer_id', 'year', 'transaction_id', 'transaction_value')
df = spark.read.option("inferSchema", "true")\
    .csv(input_path)\
    .toDF('customer_id', 'year', 'transaction_id', 'transaction_value')
#
df.show(truncate=False)
df.printSchema()
#
# partition data by 'customer_id', and then by 'year';
# each partition will have one or more files
df.write.partitionBy('customer_id', 'year')\
    .parquet(output_path)

# read the partitioned data back into another DataFrame
df2 = spark.read.parquet(output_path)
df2.show(truncate=False)
df2.printSchema()

# done!
spark.stop()
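
A possible follow-up (not part of this program): a single partition directory can also be read directly by path; the basePath option tells Spark where partition discovery starts, so customer_id and year remain available as columns. The paths below assume the /tmp/partition_demo output used in the sample run:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# read only customer c1; basePath keeps customer_id and year as columns
df_c1 = spark.read.option("basePath", "/tmp/partition_demo")\
    .parquet("/tmp/partition_demo/customer_id=c1")
df_c1.show(truncate=False)
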
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#-----------------------------------------------------
# This is a shell script to run the following program:
#    partition_data_by_customer_and_year.py
#-----------------------------------------------------
# @author Mahmoud Parsian
#-----------------------------------------------------
export SPARK_HOME="/book/spark-3.2.0"
export INPUT_PATH="/book/code/chap05/customers.txt"
export OUTPUT_PATH="/tmp/partition_demo"
export SPARK_PROG="/book/code/chap05/partition_data_by_customer_and_year.py"
#
# run the PySpark program:
$SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH $OUTPUT_PATH
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
#!/usr/bin/env python
#-----------------------------------------------------
#
# NOTE:
#   This solution creates a SINGLE FILE
#   per created partition
#
#-----------------------------------------------------
# 1. Read customers.txt
# 2. Create a DataFrame with 4 columns:
#    { <customer_id>,
#      <year>,
#      <transaction_id>,
#      <transaction_value> }
# 3. Partition data by (<customer_id>, <year>)
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------
from __future__ import print_function
import sys
from pyspark.sql import SparkSession

# define input path
input_path = sys.argv[1]
print("input_path=", input_path)

# define output path for partitioned data
output_path = sys.argv[2]
print("output_path=", output_path)

# create a SparkSession object
spark = SparkSession.builder.getOrCreate()


# create a DataFrame; note that toDF() returns a
# new DataFrame with the specified column names
# columns = ('customer_id', 'year', 'transaction_id', 'transaction_value')
df = spark.read.option("inferSchema", "true")\
    .csv(input_path)\
    .toDF('customer_id', 'year', 'transaction_id', 'transaction_value')
#
df.show(truncate=False)
df.printSchema()
#
# partition data by 'customer_id', and then by 'year',
# and create a SINGLE FILE per created partition.
# DataFrame.repartition('customer_id', 'year') guarantees
# a single file per partition.
df.repartition('customer_id', 'year')\
    .write.partitionBy('customer_id', 'year')\
    .parquet(output_path)

# read the partitioned data back into another DataFrame
df2 = spark.read.parquet(output_path)
df2.show(truncate=False)
df2.printSchema()

# done!
spark.stop()
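
As a quick sanity check (a sketch, not part of this commit), the single-file-per-partition claim can be verified with plain Python by counting part files in each customer_id=*/year=* leaf directory; the output path below assumes /tmp/partition_demo:

import glob
import os

output_path = "/tmp/partition_demo"
# each customer_id=*/year=* directory should contain exactly one part file
for leaf in sorted(glob.glob(os.path.join(output_path, "customer_id=*", "year=*"))):
    parts = glob.glob(os.path.join(leaf, "part-*.parquet"))
    print(leaf, "->", len(parts), "part file(s)")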
