
Commit 72f8ccb
added joins for RDDs and DataFrames
committed Jun 3, 2022
1 parent 5862112 commit 72f8ccb

16 files changed (+651, -8 lines)
 
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_cross.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('david', 'software'), ('david', 'business'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 5
df2.collect(): [Row(name='david', dept='software'), Row(name='david', dept='business'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+-----+---------+
| name|     dept|
+-----+---------+
|david| software|
|david| business|
| mary|marketing|
| mary|    sales|
| jane| genomics|
+-----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+--------+---+----+---------+
|name|    city|age|name|     dept|
+----+--------+---+----+---------+
|jane|  Austin| 80|jane| genomics|
|mary|    Ames| 35|mary|marketing|
|mary|    Ames| 35|mary|    sales|
|mary|Stanford| 45|mary|marketing|
|mary|Stanford| 45|mary|    sales|
|mary|Campbell| 55|mary|marketing|
|mary|Campbell| 55|mary|    sales|
+----+--------+---+----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
#-----------------------------------------------------
# Apply a join()
# source_df.join(other_df, "cross")
#
# Input: NONE
#------------------------------------------------------
# Input Parameters:
#    NONE
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------


#=========================================
def main():

    # create an instance of SparkSession
    spark = SparkSession.builder.getOrCreate()

    #========================================
    # join(other, on=None, how=None)
    #
    # Joins with another DataFrame, using the given
    # join expression.
    #
    # Parameters:
    #   other - Right side of the join
    #   on - a string for the join column name,
    #        a list of column names, a join
    #        expression (Column), or a list of Columns.
    #        If on is a string or a list of strings
    #        indicating the name of the join column(s),
    #        the column(s) must exist on both sides, and
    #        this performs an equi-join.
    #   how - str, default inner. Must be one of:
    #        inner, cross, outer, full, full_outer, left,
    #        left_outer, right, right_outer, left_semi,
    #        and left_anti.
    #
    #========================================

    triplets = [("alex", "Ames", 20),
                ("alex", "Sunnyvale", 30),
                ("alex", "Cupertino", 40),
                ("mary", "Ames", 35),
                ("mary", "Stanford", 45),
                ("mary", "Campbell", 55),
                ("jeff", "Ames", 60),
                ("jeff", "Sunnyvale", 70),
                ("jane", "Austin", 80)]

    #
    print("triplets = ", triplets)
    df = spark.createDataFrame(triplets, ["name", "city", "age"])
    print("df.count(): ", df.count())
    print("df.collect(): ", df.collect())
    df.show()
    df.printSchema()
    #
    triplets2 = [("david", "software"),
                 ("david", "business"),
                 ("mary", "marketing"),
                 ("mary", "sales"),
                 ("jane", "genomics")]

    #
    print("triplets2 = ", triplets2)
    df2 = spark.createDataFrame(triplets2, ["name", "dept"])
    print("df2.count(): ", df2.count())
    print("df2.collect(): ", df2.collect())
    df2.show()
    df2.printSchema()

    #-----------------------------------------
    # df.join(df2, df.name == df2.name, 'cross')
    #-----------------------------------------
    joined = df.join(df2, df.name == df2.name, 'cross')
    joined.show()
    joined.printSchema()

    # done!
    spark.stop()
#end-def
#====================================
if __name__ == '__main__':
    main()
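Note on the cross example above: as the log shows, passing an equality condition together with how='cross' returns only the matching rows, i.e. the same result as the inner join. A true Cartesian product takes no join condition at all. Below is a minimal sketch of an unconditioned cross join, assuming the same df and df2 as in the script above; crossJoin() is the explicit DataFrame method for this.

    # Cartesian product of df (9 rows) and df2 (5 rows): 45 rows,
    # no join condition is applied
    cartesian = df.crossJoin(df2)
    print("cartesian.count(): ", cartesian.count())   # 45
    cartesian.show(45, truncate=False)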
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_inner.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('alex', 'software'), ('alex', 'business'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 5
df2.collect(): [Row(name='alex', dept='software'), Row(name='alex', dept='business'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+----+---------+
|name|     dept|
+----+---------+
|alex| software|
|alex| business|
|mary|marketing|
|mary|    sales|
|jane| genomics|
+----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+---------+---+----+---------+
|name|     city|age|name|     dept|
+----+---------+---+----+---------+
|alex|     Ames| 20|alex| software|
|alex|     Ames| 20|alex| business|
|alex|Sunnyvale| 30|alex| software|
|alex|Sunnyvale| 30|alex| business|
|alex|Cupertino| 40|alex| software|
|alex|Cupertino| 40|alex| business|
|jane|   Austin| 80|jane| genomics|
|mary|     Ames| 35|mary|marketing|
|mary|     Ames| 35|mary|    sales|
|mary| Stanford| 45|mary|marketing|
|mary| Stanford| 45|mary|    sales|
|mary| Campbell| 55|mary|marketing|
|mary| Campbell| 55|mary|    sales|
+----+---------+---+----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
#-----------------------------------------------------
# Apply a join()
# source_df.join(other_df)
#
# Input: NONE
#------------------------------------------------------
# Input Parameters:
#    NONE
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------

#=========================================
def main():

    # create an instance of SparkSession
    spark = SparkSession.builder.getOrCreate()

    #========================================
    # join(other, on=None, how=None)
    #
    # Joins with another DataFrame, using the given
    # join expression.
    #
    # Parameters:
    #   other - Right side of the join
    #   on - a string for the join column name,
    #        a list of column names, a join
    #        expression (Column), or a list of Columns.
    #        If on is a string or a list of strings
    #        indicating the name of the join column(s),
    #        the column(s) must exist on both sides, and
    #        this performs an equi-join.
    #   how - str, default inner. Must be one of:
    #        inner, cross, outer, full, full_outer, left,
    #        left_outer, right, right_outer, left_semi,
    #        and left_anti.
    #
    #========================================

    triplets = [("alex", "Ames", 20),
                ("alex", "Sunnyvale", 30),
                ("alex", "Cupertino", 40),
                ("mary", "Ames", 35),
                ("mary", "Stanford", 45),
                ("mary", "Campbell", 55),
                ("jeff", "Ames", 60),
                ("jeff", "Sunnyvale", 70),
                ("jane", "Austin", 80)]

    #
    print("triplets = ", triplets)
    df = spark.createDataFrame(triplets, ["name", "city", "age"])
    print("df.count(): ", df.count())
    print("df.collect(): ", df.collect())
    df.show()
    df.printSchema()
    #
    triplets2 = [("alex", "software"),
                 ("alex", "business"),
                 ("mary", "marketing"),
                 ("mary", "sales"),
                 ("jane", "genomics")]

    #
    print("triplets2 = ", triplets2)
    df2 = spark.createDataFrame(triplets2, ["name", "dept"])
    print("df2.count(): ", df2.count())
    print("df2.collect(): ", df2.collect())
    df2.show()
    df2.printSchema()

    #-----------------------------------------
    # df.join(df2, df.name == df2.name, 'inner')
    #-----------------------------------------
    joined = df.join(df2, df.name == df2.name, 'inner')
    joined.show()
    joined.printSchema()

    # done!
    spark.stop()
#end-def
#===================================
if __name__ == '__main__':
    main()
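Because the join condition is df.name == df2.name, the joined result carries two name columns (see the duplicated name field in the joined schema above). Passing the column name as a string instead performs an equi-join and keeps a single name column. A minimal sketch, assuming the same df and df2 as in the script above:

    # equi-join on the shared column name: Spark keeps one 'name'
    # column, so the result schema is name, city, age, dept
    joined_single = df.join(df2, 'name', 'inner')
    joined_single.show()
    joined_single.printSchema()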
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_left.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('david', 'software'), ('david', 'business'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 5
df2.collect(): [Row(name='david', dept='software'), Row(name='david', dept='business'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+-----+---------+
| name|     dept|
+-----+---------+
|david| software|
|david| business|
| mary|marketing|
| mary|    sales|
| jane| genomics|
+-----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+---------+---+----+---------+
|name|     city|age|name|     dept|
+----+---------+---+----+---------+
|alex|     Ames| 20|null|     null|
|alex|Sunnyvale| 30|null|     null|
|alex|Cupertino| 40|null|     null|
|jane|   Austin| 80|jane| genomics|
|jeff|     Ames| 60|null|     null|
|jeff|Sunnyvale| 70|null|     null|
|mary|     Ames| 35|mary|marketing|
|mary|     Ames| 35|mary|    sales|
|mary| Stanford| 45|mary|marketing|
|mary| Stanford| 45|mary|    sales|
|mary| Campbell| 55|mary|marketing|
|mary| Campbell| 55|mary|    sales|
+----+---------+---+----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
#-----------------------------------------------------
# Apply a join()
# source_df.join(other_df, "left")
#
# Input: NONE
#------------------------------------------------------
# Input Parameters:
#    NONE
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------


#=========================================
def main():

    # create an instance of SparkSession
    spark = SparkSession.builder.getOrCreate()

    #========================================
    # join(other, on=None, how=None)
    #
    # Joins with another DataFrame, using the given
    # join expression.
    #
    # Parameters:
    #   other - Right side of the join
    #   on - a string for the join column name,
    #        a list of column names, a join
    #        expression (Column), or a list of Columns.
    #        If on is a string or a list of strings
    #        indicating the name of the join column(s),
    #        the column(s) must exist on both sides, and
    #        this performs an equi-join.
    #   how - str, default inner. Must be one of:
    #        inner, cross, outer, full, full_outer, left,
    #        left_outer, right, right_outer, left_semi,
    #        and left_anti.
    #
    #========================================

    triplets = [("alex", "Ames", 20),
                ("alex", "Sunnyvale", 30),
                ("alex", "Cupertino", 40),
                ("mary", "Ames", 35),
                ("mary", "Stanford", 45),
                ("mary", "Campbell", 55),
                ("jeff", "Ames", 60),
                ("jeff", "Sunnyvale", 70),
                ("jane", "Austin", 80)]

    #
    print("triplets = ", triplets)
    df = spark.createDataFrame(triplets, ["name", "city", "age"])
    print("df.count(): ", df.count())
    print("df.collect(): ", df.collect())
    df.show()
    df.printSchema()
    #
    triplets2 = [("david", "software"),
                 ("david", "business"),
                 ("mary", "marketing"),
                 ("mary", "sales"),
                 ("jane", "genomics")]

    #
    print("triplets2 = ", triplets2)
    df2 = spark.createDataFrame(triplets2, ["name", "dept"])
    print("df2.count(): ", df2.count())
    print("df2.collect(): ", df2.collect())
    df2.show()
    df2.printSchema()

    #-----------------------------------------
    # df.join(df2, df.name == df2.name, 'left')
    #-----------------------------------------
    joined = df.join(df2, df.name == df2.name, 'left')
    joined.show()
    joined.printSchema()

    # done!
    spark.stop()
#end-def
#==========================================
if __name__ == '__main__':
    main()
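The header comment above also lists left_semi and left_anti. A minimal sketch of both, assuming the same df and df2 as in the script above:

    # left_semi: keep only df rows that have a match in df2
    # (no df2 columns are added to the result)
    matched = df.join(df2, df.name == df2.name, 'left_semi')
    matched.show()
    # left_anti: keep only df rows with no match in df2;
    # given the data above, that is the alex and jeff rows
    unmatched = df.join(df2, df.name == df2.name, 'left_anti')
    unmatched.show()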
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_right.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('david', 'software'), ('david', 'business'), ('terry', 'coffee'), ('terry', 'hardware'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 7
df2.collect(): [Row(name='david', dept='software'), Row(name='david', dept='business'), Row(name='terry', dept='coffee'), Row(name='terry', dept='hardware'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+-----+---------+
| name|     dept|
+-----+---------+
|david| software|
|david| business|
|terry|   coffee|
|terry| hardware|
| mary|marketing|
| mary|    sales|
| jane| genomics|
+-----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+--------+----+-----+---------+
|name|    city| age| name|     dept|
+----+--------+----+-----+---------+
|null|    null|null|david| software|
|null|    null|null|david| business|
|jane|  Austin|  80| jane| genomics|
|mary|    Ames|  35| mary|marketing|
|mary|Stanford|  45| mary|marketing|
|mary|Campbell|  55| mary|marketing|
|mary|    Ames|  35| mary|    sales|
|mary|Stanford|  45| mary|    sales|
|mary|Campbell|  55| mary|    sales|
|null|    null|null|terry|   coffee|
|null|    null|null|terry| hardware|
+----+--------+----+-----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
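The script that produced this log is not included in full in this commit view; a minimal sketch of the join call that would yield the output above, assuming df and df2 are built the same way as in the other dataframe_join_*.py scripts:

    # right join: every df2 row is kept; df's columns are null where
    # no name matches (david and terry have no city/age above)
    joined = df.join(df2, df.name == df2.name, 'right')
    joined.show()
    joined.printSchema()

A right outer join mirrors a left outer join: df.join(df2, cond, 'right') returns the same rows as df2.join(df, cond, 'left'), differing only in column order.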

code/bonus_chapters/join/python/inner_join.log renamed to code/bonus_chapters/join/python/rdd_join_inner.log

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ k6,t7
 
 % A="/tmp/A.txt"
 % B="/tmp/B.txt"
-% ~/spark-3.2.1/bin/spark-submit inner_join.py $A $B
+% ~/spark-3.2.1/bin/spark-submit rdd_join_inner.py $A $B
 
 rdd_A=
 [

code/bonus_chapters/join/python/left_join.log renamed to code/bonus_chapters/join/python/rdd_join_left.log

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ k6,t7
 
 % A="/tmp/A.txt"
 % B="/tmp/B.txt"
-% ~/spark-3.2.1/bin/spark-submit left_join.py $A $B
+% ~/spark-3.2.1/bin/spark-submit rdd_join_left.py $A $B
 
 rdd_A=
 [

code/bonus_chapters/join/python/right_join.log renamed to code/bonus_chapters/join/python/rdd_join_right.log

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ k6,t7
 
 % A="/tmp/A.txt"
 % B="/tmp/B.txt"
-% ~/spark-3.2.1/bin/spark-submit right_join.py $A $B
+% ~/spark-3.2.1/bin/spark-submit rdd_join_right.py $A $B
 
 rdd_A=
 [
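The three renamed logs cover the RDD-based joins mentioned in the commit title. A minimal sketch of a pair-RDD inner join in the style of rdd_join_inner.py, assuming an existing SparkSession named spark and that A.txt and B.txt hold key,value lines such as k6,t7 (the to_pair helper name is hypothetical):

    # parse "key,value" lines into (key, value) pairs
    def to_pair(line):
        key, value = line.split(",")
        return (key, value)

    rdd_A = spark.sparkContext.textFile("/tmp/A.txt").map(to_pair)
    rdd_B = spark.sparkContext.textFile("/tmp/B.txt").map(to_pair)
    # RDD.join() is an inner join by key: it emits one
    # (k, (value_from_A, value_from_B)) pair per match;
    # leftOuterJoin()/rightOuterJoin() are the outer variants
    joined = rdd_A.join(rdd_B)
    print("joined = ", joined.collect())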

code/chap01/python/dataframe_join_right.py

Lines changed: 0 additions & 5 deletions
@@ -86,11 +86,6 @@ def main():
 
     # done!
     spark.stop()
-    # t3 = (name, city, number)
-    name = t3[0]
-    #city = t3[1]
-    number = int(t3[2])
-    return (name, number)
 #end-def
 #==========================================
 if __name__ == '__main__':
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
Each record has the following format:

<customer_id><,><year><,><transaction_id><,><amount>
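For example, one such record might look like this (hypothetical values):

c1,2019,T0011,20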
Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
$SPARK_HOME/bin/spark-submit partition_data_as_text_by_year_month.py customers_with_date.txt /tmp/output

input_path= customers_with_date.txt
output_path= /tmp/output

df::
+-----------+---------+--------------+------+
|customer_id|date     |transaction_id|amount|
+-----------+---------+--------------+------+
|c1         |2/9/2019 |T0011         |20    |
|c1         |2/9/2019 |T0012         |12    |
|c1         |3/9/2019 |T0013         |30    |
|c1         |3/9/2019 |T0014         |42    |
|c1         |4/12/2019|T0023         |48    |
|c1         |4/12/2018|T0051         |28    |
|c1         |4/12/2019|T0043         |42    |
|c1         |4/12/2018|T0091         |29    |
|c1         |1/3/2018 |T0002         |12    |
|c1         |4/3/2018 |T0003         |44    |
|c2         |2/10/2019|T0511         |20    |
|c2         |2/10/2019|T0612         |17    |
|c2         |2/9/2019 |T0061         |25    |
|c2         |2/9/2019 |T0062         |78    |
|c2         |3/12/2019|T0513         |67    |
|c2         |3/12/2019|T0014         |42    |
|c2         |4/10/2019|T0023         |48    |
|c2         |4/10/2018|T0051         |28    |
|c2         |4/12/2019|T0043         |42    |
|c2         |4/12/2018|T0091         |29    |
+-----------+---------+--------------+------+
only showing top 20 rows

root
 |-- customer_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- amount: string (nullable = true)

df2::
+-----------+---------+--------------+------+----+-----+
|customer_id|date     |transaction_id|amount|year|month|
+-----------+---------+--------------+------+----+-----+
|c1         |2/9/2019 |T0011         |20    |2019|9    |
|c1         |2/9/2019 |T0012         |12    |2019|9    |
|c1         |3/9/2019 |T0013         |30    |2019|9    |
|c1         |3/9/2019 |T0014         |42    |2019|9    |
|c1         |4/12/2019|T0023         |48    |2019|12   |
|c1         |4/12/2018|T0051         |28    |2018|12   |
|c1         |4/12/2019|T0043         |42    |2019|12   |
|c1         |4/12/2018|T0091         |29    |2018|12   |
|c1         |1/3/2018 |T0002         |12    |2018|3    |
|c1         |4/3/2018 |T0003         |44    |2018|3    |
|c2         |2/10/2019|T0511         |20    |2019|10   |
|c2         |2/10/2019|T0612         |17    |2019|10   |
|c2         |2/9/2019 |T0061         |25    |2019|9    |
|c2         |2/9/2019 |T0062         |78    |2019|9    |
|c2         |3/12/2019|T0513         |67    |2019|12   |
|c2         |3/12/2019|T0014         |42    |2019|12   |
|c2         |4/10/2019|T0023         |48    |2019|10   |
|c2         |4/10/2018|T0051         |28    |2018|10   |
|c2         |4/12/2019|T0043         |42    |2019|12   |
|c2         |4/12/2018|T0091         |29    |2018|12   |
+-----------+---------+--------------+------+----+-----+
only showing top 20 rows

root
 |-- customer_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

df3::
+-----------+---------+--------------+------+----+-----+
|customer_id|date     |transaction_id|amount|year|month|
+-----------+---------+--------------+------+----+-----+
|c1         |4/12/2019|T0023         |48    |2019|12   |
|c1         |4/12/2019|T0043         |42    |2019|12   |
|c2         |3/12/2019|T0513         |67    |2019|12   |
|c2         |3/12/2019|T0014         |42    |2019|12   |
|c2         |4/12/2019|T0043         |42    |2019|12   |
|c1         |4/12/2018|T0051         |28    |2018|12   |
|c1         |4/12/2018|T0091         |29    |2018|12   |
|c2         |4/12/2018|T0091         |29    |2018|12   |
|c1         |2/9/2019 |T0011         |20    |2019|9    |
|c1         |2/9/2019 |T0012         |12    |2019|9    |
|c1         |3/9/2019 |T0013         |30    |2019|9    |
|c1         |3/9/2019 |T0014         |42    |2019|9    |
|c2         |2/9/2019 |T0061         |25    |2019|9    |
|c2         |2/9/2019 |T0062         |78    |2019|9    |
|c2         |2/10/2019|T0511         |20    |2019|10   |
|c2         |2/10/2019|T0612         |17    |2019|10   |
|c2         |4/10/2019|T0023         |48    |2019|10   |
|c1         |1/3/2018 |T0002         |12    |2018|3    |
|c1         |4/3/2018 |T0003         |44    |2018|3    |
|c2         |1/9/2018 |T0002         |12    |2018|9    |
+-----------+---------+--------------+------+----+-----+
only showing top 20 rows

root
 |-- customer_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
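The partition_data_as_text_by_year_month.py source is not included in this commit view. A minimal sketch of the steps that would produce df2, df3, and a partitioned text output like the log above; the split-on-'/' date handling, the repartition() step, and the output_path variable are assumptions inferred from the log:

    from pyspark.sql.functions import split, col, concat_ws

    # derive integer year and month from a M/D/YYYY date string
    date_parts = split(col("date"), "/")
    df2 = df.withColumn("year", date_parts.getItem(2).cast("integer")) \
            .withColumn("month", date_parts.getItem(0).cast("integer"))

    # df3: cluster rows by the partition columns (matches the grouped
    # ordering shown in the df3 output above)
    df3 = df2.repartition("year", "month")

    # DataFrameWriter.text() writes a single string column, so the data
    # columns are first joined with commas; year and month become
    # directories such as /tmp/output/year=2019/month=12/
    to_write = df3.select(
        concat_ws(",", "customer_id", "date", "transaction_id", "amount").alias("value"),
        "year", "month")
    to_write.write.partitionBy("year", "month").text(output_path)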
