
Commit 72f8ccb
added joins for RDDs and DataFrames
committed Jun 3, 2022
1 parent 5862112 commit 72f8ccb

16 files changed (+651, -8 lines)
 
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_cross.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('david', 'software'), ('david', 'business'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 5
df2.collect(): [Row(name='david', dept='software'), Row(name='david', dept='business'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+-----+---------+
| name|     dept|
+-----+---------+
|david| software|
|david| business|
| mary|marketing|
| mary|    sales|
| jane| genomics|
+-----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+--------+---+----+---------+
|name|    city|age|name|     dept|
+----+--------+---+----+---------+
|jane|  Austin| 80|jane| genomics|
|mary|    Ames| 35|mary|marketing|
|mary|    Ames| 35|mary|    sales|
|mary|Stanford| 45|mary|marketing|
|mary|Stanford| 45|mary|    sales|
|mary|Campbell| 55|mary|marketing|
|mary|Campbell| 55|mary|    sales|
+----+--------+---+----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
#-----------------------------------------------------
# Apply a join()
# source_df.join(other_df, "cross")
#
# Input: NONE
#------------------------------------------------------
# Input Parameters:
#    NONE
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------


#=========================================
def main():

    # create an instance of SparkSession
    spark = SparkSession.builder.getOrCreate()

    #========================================
    # join(other, on=None, how=None)
    #
    # Joins with another DataFrame, using the given
    # join expression.
    #
    # Parameters:
    #   other - Right side of the join
    #   on - a string for the join column name,
    #        a list of column names, a join
    #        expression (Column), or a list of Columns.
    #        If on is a string or a list of strings
    #        indicating the name of the join column(s),
    #        the column(s) must exist on both sides, and
    #        this performs an equi-join.
    #   how - str, default inner. Must be one of:
    #        inner, cross, outer, full, full_outer, left,
    #        left_outer, right, right_outer, left_semi,
    #        and left_anti.
    #
    #========================================

    triplets = [("alex", "Ames", 20),
                ("alex", "Sunnyvale", 30),
                ("alex", "Cupertino", 40),
                ("mary", "Ames", 35),
                ("mary", "Stanford", 45),
                ("mary", "Campbell", 55),
                ("jeff", "Ames", 60),
                ("jeff", "Sunnyvale", 70),
                ("jane", "Austin", 80)]

    #
    print("triplets = ", triplets)
    df = spark.createDataFrame(triplets, ["name", "city", "age"])
    print("df.count(): ", df.count())
    print("df.collect(): ", df.collect())
    df.show()
    df.printSchema()
    #
    triplets2 = [("david", "software"),
                 ("david", "business"),
                 ("mary", "marketing"),
                 ("mary", "sales"),
                 ("jane", "genomics")]

    #
    print("triplets2 = ", triplets2)
    df2 = spark.createDataFrame(triplets2, ["name", "dept"])
    print("df2.count(): ", df2.count())
    print("df2.collect(): ", df2.collect())
    df2.show()
    df2.printSchema()

    #-----------------------------------------
    # df.join(df2, df.name == df2.name, 'cross')
    #-----------------------------------------
    joined = df.join(df2, df.name == df2.name, 'cross')
    joined.show()
    joined.printSchema()

    # done!
    spark.stop()
#end-def
#====================================
if __name__ == '__main__':
    main()
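Note on the cross example above: as the log shows, passing an equality condition together with how='cross' returns only the matching rows, i.e. the same result as the inner join. A true Cartesian product takes no join condition at all. Below is a minimal sketch of an unconditioned cross join, assuming the same df and df2 as in the script above; crossJoin() is the explicit DataFrame method for this.

    # Cartesian product of df (9 rows) and df2 (5 rows): 45 rows,
    # no join condition is applied
    cartesian = df.crossJoin(df2)
    print("cartesian.count(): ", cartesian.count())   # 45
    cartesian.show(45, truncate=False)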
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_inner.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('alex', 'software'), ('alex', 'business'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 5
df2.collect(): [Row(name='alex', dept='software'), Row(name='alex', dept='business'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+----+---------+
|name|     dept|
+----+---------+
|alex| software|
|alex| business|
|mary|marketing|
|mary|    sales|
|jane| genomics|
+----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+---------+---+----+---------+
|name|     city|age|name|     dept|
+----+---------+---+----+---------+
|alex|     Ames| 20|alex| software|
|alex|     Ames| 20|alex| business|
|alex|Sunnyvale| 30|alex| software|
|alex|Sunnyvale| 30|alex| business|
|alex|Cupertino| 40|alex| software|
|alex|Cupertino| 40|alex| business|
|jane|   Austin| 80|jane| genomics|
|mary|     Ames| 35|mary|marketing|
|mary|     Ames| 35|mary|    sales|
|mary| Stanford| 45|mary|marketing|
|mary| Stanford| 45|mary|    sales|
|mary| Campbell| 55|mary|marketing|
|mary| Campbell| 55|mary|    sales|
+----+---------+---+----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
#-----------------------------------------------------
# Apply a join()
# source_df.join(other_df)
#
# Input: NONE
#------------------------------------------------------
# Input Parameters:
#    NONE
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------

#=========================================
def main():

    # create an instance of SparkSession
    spark = SparkSession.builder.getOrCreate()

    #========================================
    # join(other, on=None, how=None)
    #
    # Joins with another DataFrame, using the given
    # join expression.
    #
    # Parameters:
    #   other - Right side of the join
    #   on - a string for the join column name,
    #        a list of column names, a join
    #        expression (Column), or a list of Columns.
    #        If on is a string or a list of strings
    #        indicating the name of the join column(s),
    #        the column(s) must exist on both sides, and
    #        this performs an equi-join.
    #   how - str, default inner. Must be one of:
    #        inner, cross, outer, full, full_outer, left,
    #        left_outer, right, right_outer, left_semi,
    #        and left_anti.
    #
    #========================================

    triplets = [("alex", "Ames", 20),
                ("alex", "Sunnyvale", 30),
                ("alex", "Cupertino", 40),
                ("mary", "Ames", 35),
                ("mary", "Stanford", 45),
                ("mary", "Campbell", 55),
                ("jeff", "Ames", 60),
                ("jeff", "Sunnyvale", 70),
                ("jane", "Austin", 80)]

    #
    print("triplets = ", triplets)
    df = spark.createDataFrame(triplets, ["name", "city", "age"])
    print("df.count(): ", df.count())
    print("df.collect(): ", df.collect())
    df.show()
    df.printSchema()
    #
    triplets2 = [("alex", "software"),
                 ("alex", "business"),
                 ("mary", "marketing"),
                 ("mary", "sales"),
                 ("jane", "genomics")]

    #
    print("triplets2 = ", triplets2)
    df2 = spark.createDataFrame(triplets2, ["name", "dept"])
    print("df2.count(): ", df2.count())
    print("df2.collect(): ", df2.collect())
    df2.show()
    df2.printSchema()

    #-----------------------------------------
    # df.join(df2, df.name == df2.name, 'inner')
    #-----------------------------------------
    joined = df.join(df2, df.name == df2.name, 'inner')
    joined.show()
    joined.printSchema()

    # done!
    spark.stop()
#end-def
#===================================
if __name__ == '__main__':
    main()
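Because the join condition is df.name == df2.name, the joined result carries two name columns (see the duplicated name field in the joined schema above). Passing the column name as a string instead performs an equi-join and keeps a single name column. A minimal sketch, assuming the same df and df2 as in the script above:

    # equi-join on the shared column name: Spark keeps one 'name'
    # column, so the result schema is name, city, age, dept
    joined_single = df.join(df2, 'name', 'inner')
    joined_single.show()
    joined_single.printSchema()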
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_left.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('david', 'software'), ('david', 'business'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 5
df2.collect(): [Row(name='david', dept='software'), Row(name='david', dept='business'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+-----+---------+
| name|     dept|
+-----+---------+
|david| software|
|david| business|
| mary|marketing|
| mary|    sales|
| jane| genomics|
+-----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+---------+---+----+---------+
|name|     city|age|name|     dept|
+----+---------+---+----+---------+
|alex|     Ames| 20|null|     null|
|alex|Sunnyvale| 30|null|     null|
|alex|Cupertino| 40|null|     null|
|jane|   Austin| 80|jane| genomics|
|jeff|     Ames| 60|null|     null|
|jeff|Sunnyvale| 70|null|     null|
|mary|     Ames| 35|mary|marketing|
|mary|     Ames| 35|mary|    sales|
|mary| Stanford| 45|mary|marketing|
|mary| Stanford| 45|mary|    sales|
|mary| Campbell| 55|mary|marketing|
|mary| Campbell| 55|mary|    sales|
+----+---------+---+----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
#-----------------------------------------------------
# Apply a join()
# source_df.join(other_df, "left")
#
# Input: NONE
#------------------------------------------------------
# Input Parameters:
#    NONE
#-------------------------------------------------------
# @author Mahmoud Parsian
#-------------------------------------------------------


#=========================================
def main():

    # create an instance of SparkSession
    spark = SparkSession.builder.getOrCreate()

    #========================================
    # join(other, on=None, how=None)
    #
    # Joins with another DataFrame, using the given
    # join expression.
    #
    # Parameters:
    #   other - Right side of the join
    #   on - a string for the join column name,
    #        a list of column names, a join
    #        expression (Column), or a list of Columns.
    #        If on is a string or a list of strings
    #        indicating the name of the join column(s),
    #        the column(s) must exist on both sides, and
    #        this performs an equi-join.
    #   how - str, default inner. Must be one of:
    #        inner, cross, outer, full, full_outer, left,
    #        left_outer, right, right_outer, left_semi,
    #        and left_anti.
    #
    #========================================

    triplets = [("alex", "Ames", 20),
                ("alex", "Sunnyvale", 30),
                ("alex", "Cupertino", 40),
                ("mary", "Ames", 35),
                ("mary", "Stanford", 45),
                ("mary", "Campbell", 55),
                ("jeff", "Ames", 60),
                ("jeff", "Sunnyvale", 70),
                ("jane", "Austin", 80)]

    #
    print("triplets = ", triplets)
    df = spark.createDataFrame(triplets, ["name", "city", "age"])
    print("df.count(): ", df.count())
    print("df.collect(): ", df.collect())
    df.show()
    df.printSchema()
    #
    triplets2 = [("david", "software"),
                 ("david", "business"),
                 ("mary", "marketing"),
                 ("mary", "sales"),
                 ("jane", "genomics")]

    #
    print("triplets2 = ", triplets2)
    df2 = spark.createDataFrame(triplets2, ["name", "dept"])
    print("df2.count(): ", df2.count())
    print("df2.collect(): ", df2.collect())
    df2.show()
    df2.printSchema()

    #-----------------------------------------
    # df.join(df2, df.name == df2.name, 'left')
    #-----------------------------------------
    joined = df.join(df2, df.name == df2.name, 'left')
    joined.show()
    joined.printSchema()

    # done!
    spark.stop()
#end-def
#==========================================
if __name__ == '__main__':
    main()
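The header comment above also lists left_semi and left_anti. A minimal sketch of both, assuming the same df and df2 as in the script above:

    # left_semi: keep only df rows that have a match in df2
    # (no df2 columns are added to the result)
    matched = df.join(df2, df.name == df2.name, 'left_semi')
    matched.show()
    # left_anti: keep only df rows with no match in df2;
    # given the data above, that is the alex and jeff rows
    unmatched = df.join(df2, df.name == df2.name, 'left_anti')
    unmatched.show()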
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
% ~/spark-3.2.1/bin/spark-submit dataframe_join_right.py

triplets = [('alex', 'Ames', 20), ('alex', 'Sunnyvale', 30), ('alex', 'Cupertino', 40), ('mary', 'Ames', 35), ('mary', 'Stanford', 45), ('mary', 'Campbell', 55), ('jeff', 'Ames', 60), ('jeff', 'Sunnyvale', 70), ('jane', 'Austin', 80)]
df.count(): 9
df.collect(): [Row(name='alex', city='Ames', age=20), Row(name='alex', city='Sunnyvale', age=30), Row(name='alex', city='Cupertino', age=40), Row(name='mary', city='Ames', age=35), Row(name='mary', city='Stanford', age=45), Row(name='mary', city='Campbell', age=55), Row(name='jeff', city='Ames', age=60), Row(name='jeff', city='Sunnyvale', age=70), Row(name='jane', city='Austin', age=80)]
+----+---------+---+
|name|     city|age|
+----+---------+---+
|alex|     Ames| 20|
|alex|Sunnyvale| 30|
|alex|Cupertino| 40|
|mary|     Ames| 35|
|mary| Stanford| 45|
|mary| Campbell| 55|
|jeff|     Ames| 60|
|jeff|Sunnyvale| 70|
|jane|   Austin| 80|
+----+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

triplets2 = [('david', 'software'), ('david', 'business'), ('terry', 'coffee'), ('terry', 'hardware'), ('mary', 'marketing'), ('mary', 'sales'), ('jane', 'genomics')]
df2.count(): 7
df2.collect(): [Row(name='david', dept='software'), Row(name='david', dept='business'), Row(name='terry', dept='coffee'), Row(name='terry', dept='hardware'), Row(name='mary', dept='marketing'), Row(name='mary', dept='sales'), Row(name='jane', dept='genomics')]
+-----+---------+
| name|     dept|
+-----+---------+
|david| software|
|david| business|
|terry|   coffee|
|terry| hardware|
| mary|marketing|
| mary|    sales|
| jane| genomics|
+-----+---------+

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)

+----+--------+----+-----+---------+
|name|    city| age| name|     dept|
+----+--------+----+-----+---------+
|null|    null|null|david| software|
|null|    null|null|david| business|
|jane|  Austin|  80| jane| genomics|
|mary|    Ames|  35| mary|marketing|
|mary|Stanford|  45| mary|marketing|
|mary|Campbell|  55| mary|marketing|
|mary|    Ames|  35| mary|    sales|
|mary|Stanford|  45| mary|    sales|
|mary|Campbell|  55| mary|    sales|
|null|    null|null|terry|   coffee|
|null|    null|null|terry| hardware|
+----+--------+----+-----+---------+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
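The script that produced this log is not included in full in this commit view; a minimal sketch of the join call that would yield the output above, assuming df and df2 are built the same way as in the other dataframe_join_*.py scripts:

    # right join: every df2 row is kept; df's columns are null where
    # no name matches (david and terry have no city/age above)
    joined = df.join(df2, df.name == df2.name, 'right')
    joined.show()
    joined.printSchema()

A right outer join mirrors a left outer join: df.join(df2, cond, 'right') returns the same rows as df2.join(df, cond, 'left'), differing only in column order.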

code/bonus_chapters/join/python/inner_join.log renamed to code/bonus_chapters/join/python/rdd_join_inner.log

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ k6,t7
 
 % A="/tmp/A.txt"
 % B="/tmp/B.txt"
-% ~/spark-3.2.1/bin/spark-submit inner_join.py $A $B
+% ~/spark-3.2.1/bin/spark-submit rdd_join_inner.py $A $B
 
 rdd_A=
 [

code/bonus_chapters/join/python/left_join.log renamed to code/bonus_chapters/join/python/rdd_join_left.log

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ k6,t7
 
 % A="/tmp/A.txt"
 % B="/tmp/B.txt"
-% ~/spark-3.2.1/bin/spark-submit left_join.py $A $B
+% ~/spark-3.2.1/bin/spark-submit rdd_join_left.py $A $B
 
 rdd_A=
 [

code/bonus_chapters/join/python/right_join.log renamed to code/bonus_chapters/join/python/rdd_join_right.log

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ k6,t7
 
 % A="/tmp/A.txt"
 % B="/tmp/B.txt"
-% ~/spark-3.2.1/bin/spark-submit right_join.py $A $B
+% ~/spark-3.2.1/bin/spark-submit rdd_join_right.py $A $B
 
 rdd_A=
 [
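The three renamed logs cover the RDD-based joins mentioned in the commit title. A minimal sketch of a pair-RDD inner join in the style of rdd_join_inner.py, assuming an existing SparkSession named spark and that A.txt and B.txt hold key,value lines such as k6,t7 (the to_pair helper name is hypothetical):

    # parse "key,value" lines into (key, value) pairs
    def to_pair(line):
        key, value = line.split(",")
        return (key, value)

    rdd_A = spark.sparkContext.textFile("/tmp/A.txt").map(to_pair)
    rdd_B = spark.sparkContext.textFile("/tmp/B.txt").map(to_pair)
    # RDD.join() is an inner join by key: it emits one
    # (k, (value_from_A, value_from_B)) pair per match;
    # leftOuterJoin()/rightOuterJoin() are the outer variants
    joined = rdd_A.join(rdd_B)
    print("joined = ", joined.collect())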

code/chap01/python/dataframe_join_right.py

Lines changed: 0 additions & 5 deletions
@@ -86,11 +86,6 @@ def main():
 
     # done!
     spark.stop()
-    # t3 = (name, city, number)
-    name = t3[0]
-    #city = t3[1]
-    number = int(t3[2])
-    return (name, number)
 #end-def
 #==========================================
 if __name__ == '__main__':
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
Each record has the following format:

<customer_id><,><year><,><transaction_id><,><amount>
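For example, one such record might look like this (hypothetical values):

c1,2019,T0011,20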
Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
$SPARK_HOME/bin/spark-submit partition_data_as_text_by_year_month.py customers_with_date.txt /tmp/output

input_path= customers_with_date.txt
output_path= /tmp/output

df::
+-----------+---------+--------------+------+
|customer_id|date     |transaction_id|amount|
+-----------+---------+--------------+------+
|c1         |2/9/2019 |T0011         |20    |
|c1         |2/9/2019 |T0012         |12    |
|c1         |3/9/2019 |T0013         |30    |
|c1         |3/9/2019 |T0014         |42    |
|c1         |4/12/2019|T0023         |48    |
|c1         |4/12/2018|T0051         |28    |
|c1         |4/12/2019|T0043         |42    |
|c1         |4/12/2018|T0091         |29    |
|c1         |1/3/2018 |T0002         |12    |
|c1         |4/3/2018 |T0003         |44    |
|c2         |2/10/2019|T0511         |20    |
|c2         |2/10/2019|T0612         |17    |
|c2         |2/9/2019 |T0061         |25    |
|c2         |2/9/2019 |T0062         |78    |
|c2         |3/12/2019|T0513         |67    |
|c2         |3/12/2019|T0014         |42    |
|c2         |4/10/2019|T0023         |48    |
|c2         |4/10/2018|T0051         |28    |
|c2         |4/12/2019|T0043         |42    |
|c2         |4/12/2018|T0091         |29    |
+-----------+---------+--------------+------+
only showing top 20 rows

root
 |-- customer_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- amount: string (nullable = true)

df2::
+-----------+---------+--------------+------+----+-----+
|customer_id|date     |transaction_id|amount|year|month|
+-----------+---------+--------------+------+----+-----+
|c1         |2/9/2019 |T0011         |20    |2019|9    |
|c1         |2/9/2019 |T0012         |12    |2019|9    |
|c1         |3/9/2019 |T0013         |30    |2019|9    |
|c1         |3/9/2019 |T0014         |42    |2019|9    |
|c1         |4/12/2019|T0023         |48    |2019|12   |
|c1         |4/12/2018|T0051         |28    |2018|12   |
|c1         |4/12/2019|T0043         |42    |2019|12   |
|c1         |4/12/2018|T0091         |29    |2018|12   |
|c1         |1/3/2018 |T0002         |12    |2018|3    |
|c1         |4/3/2018 |T0003         |44    |2018|3    |
|c2         |2/10/2019|T0511         |20    |2019|10   |
|c2         |2/10/2019|T0612         |17    |2019|10   |
|c2         |2/9/2019 |T0061         |25    |2019|9    |
|c2         |2/9/2019 |T0062         |78    |2019|9    |
|c2         |3/12/2019|T0513         |67    |2019|12   |
|c2         |3/12/2019|T0014         |42    |2019|12   |
|c2         |4/10/2019|T0023         |48    |2019|10   |
|c2         |4/10/2018|T0051         |28    |2018|10   |
|c2         |4/12/2019|T0043         |42    |2019|12   |
|c2         |4/12/2018|T0091         |29    |2018|12   |
+-----------+---------+--------------+------+----+-----+
only showing top 20 rows

root
 |-- customer_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

df3::
+-----------+---------+--------------+------+----+-----+
|customer_id|date     |transaction_id|amount|year|month|
+-----------+---------+--------------+------+----+-----+
|c1         |4/12/2019|T0023         |48    |2019|12   |
|c1         |4/12/2019|T0043         |42    |2019|12   |
|c2         |3/12/2019|T0513         |67    |2019|12   |
|c2         |3/12/2019|T0014         |42    |2019|12   |
|c2         |4/12/2019|T0043         |42    |2019|12   |
|c1         |4/12/2018|T0051         |28    |2018|12   |
|c1         |4/12/2018|T0091         |29    |2018|12   |
|c2         |4/12/2018|T0091         |29    |2018|12   |
|c1         |2/9/2019 |T0011         |20    |2019|9    |
|c1         |2/9/2019 |T0012         |12    |2019|9    |
|c1         |3/9/2019 |T0013         |30    |2019|9    |
|c1         |3/9/2019 |T0014         |42    |2019|9    |
|c2         |2/9/2019 |T0061         |25    |2019|9    |
|c2         |2/9/2019 |T0062         |78    |2019|9    |
|c2         |2/10/2019|T0511         |20    |2019|10   |
|c2         |2/10/2019|T0612         |17    |2019|10   |
|c2         |4/10/2019|T0023         |48    |2019|10   |
|c1         |1/3/2018 |T0002         |12    |2018|3    |
|c1         |4/3/2018 |T0003         |44    |2018|3    |
|c2         |1/9/2018 |T0002         |12    |2018|9    |
+-----------+---------+--------------+------+----+-----+
only showing top 20 rows

root
 |-- customer_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
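The partition_data_as_text_by_year_month.py source is not included in this commit view. A minimal sketch of the steps that would produce df2, df3, and a partitioned text output like the log above; the split-on-'/' date handling, the repartition() step, and the output_path variable are assumptions inferred from the log:

    from pyspark.sql.functions import split, col, concat_ws

    # derive integer year and month from a M/D/YYYY date string
    date_parts = split(col("date"), "/")
    df2 = df.withColumn("year", date_parts.getItem(2).cast("integer")) \
            .withColumn("month", date_parts.getItem(0).cast("integer"))

    # df3: cluster rows by the partition columns (matches the grouped
    # ordering shown in the df3 output above)
    df3 = df2.repartition("year", "month")

    # DataFrameWriter.text() writes a single string column, so the data
    # columns are first joined with commas; year and month become
    # directories such as /tmp/output/year=2019/month=12/
    to_write = df3.select(
        concat_ws(",", "customer_id", "date", "transaction_id", "amount").alias("value"),
        "year", "month")
    to_write.write.partitionBy("year", "month").text(output_path)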
