
Commit 4d7666a

Get distinct values from each column as list
1 parent c2f545b commit 4d7666a

File tree

2 files changed: +16 -86 lines changed

build.sbt (+4 -2)
@@ -3,5 +3,7 @@ version := "0.1"
 scalaVersion := "2.12.10"
 
 // https://mvnrepository.com/artifact/org.apache.spark/spark-core
-libraryDependencies += "org.apache.spark" %% "spark-core" % "3.0.0-preview2"
-libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0-preview2"
+libraryDependencies ++= Seq(
+  "org.apache.spark" %% "spark-core" % "3.0.0-preview2",
+  "org.apache.spark" %% "spark-sql" % "3.0.0-preview2"
+)

src/main/scala/SparkTutorial.scala (+12 -84)
@@ -4,109 +4,37 @@ import org.apache.spark.sql.SparkSession
 object SparkTutorial {
   def main(args: Array[String]): Unit = {
 
-    // Turn off logging
     Logger.getLogger("org").setLevel(Level.OFF)
     Logger.getLogger("akka").setLevel(Level.OFF)
 
-    //--------------------------------------------------------------------------------------------------------
-    // Setting up a Spark Session
-    //--------------------------------------------------------------------------------------------------------
-
-    // Create a SparkSession to work with Spark
+    // Create SparkSession
     val sparkBuilder = SparkSession
       .builder()
       .appName("SparkTutorial")
-      .master("local[4]") // local, with 4 worker cores
+      .master("local[4]")
     val spark = sparkBuilder.getOrCreate()
 
-    // Set the default number of shuffle partitions (default is 200, which is too high for local deployment)
-    spark.conf.set("spark.sql.shuffle.partitions", "8") //
+    // Set the default number of shuffle partitions
+    spark.conf.set("spark.sql.shuffle.partitions", "8")
 
     // Importing implicit encoders for standard library classes and tuples that are used as Dataset types
     import spark.implicits._
 
-    println("-----------------------------------------------------------------------------------------------")
-
-    //--------------------------------------------------------------------------------------------------------
-    // Loading data
-    //--------------------------------------------------------------------------------------------------------
-
-    // Create a Dataset programmatically
-    val numbers = spark.createDataset((0 until 100).toList)
-
-    // Read a Dataset from a file
-    val customers = spark.read
+    // Read Dataset from the file
+    val df = spark.read
       .option("inferSchema", "true")
       .option("header", "true")
       .option("sep", ";")
-      .csv("data/tpch_customer.csv") // also text, json, jdbc, parquet
-      .as[(Int, String, String, Int, String, String, String, String)]
-
-    println("-----------------------------------------------------------------------------------------------")
-
-    //--------------------------------------------------------------------------------------------------------
-    // Basic transformations
-    //--------------------------------------------------------------------------------------------------------
-
-    // Basic transformations on datasets return new datasets
-    val mapped = numbers.map(i => "This is a number: " + i)
-    val filtered = mapped.filter(s => s.contains("1"))
-    val sorted = filtered.sort()
-    List(numbers, mapped, filtered, sorted).foreach(dataset => println(dataset.getClass))
-    sorted.show()
-
-    println("-----------------------------------------------------------------------------------------------")
-
-    // Basic terminal operations
-    val collected = filtered.collect() // collects the entire dataset to the driver process
-    val reduced = filtered.reduce((s1, s2) => s1 + "," + s2) // reduces all values successively to one
-    filtered.foreach(s => println(s)) // performs an action for each element (take care where the action is evaluated!)
-    List(collected, reduced).foreach(result => println(result.getClass))
-
-    println("-----------------------------------------------------------------------------------------------")
-
-    // DataFrame and Dataset
-    val untypedDF = numbers.toDF() // DS to DF
-    val stringTypedDS = untypedDF.map(r => r.get(0).toString) // DF to DS via map
-    val integerTypedDS = untypedDF.as[Int] // DF to DS via as() function that cast columns to a concrete types
-    List(untypedDF, stringTypedDS, integerTypedDS).foreach(result => println(result.head.getClass))
-    List(untypedDF, stringTypedDS, integerTypedDS).foreach(result => println(result.head))
-
-    println("-----------------------------------------------------------------------------------------------")
-
-    // Mapping to tuples
-    numbers
-      .map(i => (i, "nonce", 3.1415, true))
-      .take(10)
-      .foreach(println(_))
-
-    println("-----------------------------------------------------------------------------------------------")
-
-    // SQL on DataFrames
-    customers.createOrReplaceTempView("customers") // make this dataframe visible as a table
-    val sqlResult = spark.sql("SELECT * FROM customers WHERE C_NATIONKEY = 15") // perform an sql query on the table
-
-    import org.apache.spark.sql.functions._
+      .csv("data/test_customer.csv")
 
-    sqlResult // DF
-      .as[(Int, String, String, Int, String, String, String, String)] // DS
-      .sort(desc("C_NATIONKEY")) // desc() is a standard function from the spark.sql.functions package
-      .head(10)
-      .foreach(println(_))
+    val columns = df.columns.toList
 
-    println("-----------------------------------------------------------------------------------------------")
+    columns.foreach(column => {
+      val names = df.select(column).distinct.map(_.get(0).toString).collect.toList
+      println(names)
+    })
 
-    // Grouping and aggregation for Datasets
-    // val topEarners = customers
-    //   .groupByKey { case (name, age, salary, company) => company }
-    //   .mapGroups { case (key, iterator) =>
-    //     val topEarner = iterator.toList.maxBy(t => t._3) // could be problematic: Why?
-    //     (key, topEarner._1, topEarner._3)
-    //   }
-    //   .sort(desc("_3"))
-    // topEarners.collect().foreach(t => println(t._1 + "'s top earner is " + t._2 + " with salary " + t._3))
 
-    println("-----------------------------------------------------------------------------------------------")
 
   }
 }
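
For readers following the commit, below is a minimal, self-contained sketch of the pattern it introduces: read a semicolon-separated CSV and print the distinct values of each column as a list. The object name, app name, and the assumption that data/test_customer.csv exists locally are illustrative, not part of the commit.

import org.apache.spark.sql.SparkSession

// Standalone sketch of the distinct-values-per-column pattern from this commit.
// Object name, app name, and CSV path are placeholders.
object DistinctValuesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("DistinctValuesSketch")
      .master("local[4]")
      .getOrCreate()
    // Needed for the String encoder used by map() below
    import spark.implicits._

    val df = spark.read
      .option("inferSchema", "true")
      .option("header", "true")
      .option("sep", ";")
      .csv("data/test_customer.csv") // placeholder path

    // One Spark job per column: project the column, deduplicate,
    // stringify the values on the executors, then collect to the driver.
    df.columns.foreach { column =>
      val distinctValues = df.select(column).distinct()
        .map(_.get(0).toString)
        .collect()
        .toList
      println(s"$column: $distinctValues")
    }

    spark.stop()
  }
}

Note that this pattern launches one Spark job per column and collects every distinct value to the driver, which is fine for small test data but can become expensive for wide or high-cardinality tables.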
