
Commit 584cb74

replsanjuveapen authored and committed
Added BookRecommender
1 parent f089a97 commit 584cb74

File tree

5 files changed: +202 -25 lines changed


build.sbt

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,8 @@ libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.3"
 
 libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.3"
 
+libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.4.3"
+
 libraryDependencies += "org.apache.kafka" %% "kafka" % "2.2.0"
 
 libraryDependencies += "org.mongodb.spark" %% "mongo-spark-connector" % "2.4.0"

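The only functional addition here is the spark-mllib artifact, which is what the ALS-based recommender introduced by this commit compiles against (the artifact carries both the DataFrame-based org.apache.spark.ml API and the older RDD-based spark.mllib API). A sketch of how the Spark entries would read after the change, assuming the lines shown in the hunk above are the only Spark dependencies in build.sbt:

// Spark dependencies after this commit, all pinned to 2.4.3
libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.3"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.3"
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "2.4.3" // new: provides org.apache.spark.ml.recommendation.ALS
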
project/plugins.sbt

Lines changed: 2 additions & 0 deletions
@@ -1 +1,3 @@
 logLevel := Level.Warn
+
+addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4")

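sbteclipse contributes an eclipse task to sbt that generates Eclipse project metadata (.project and .classpath) from the build definition. Typical usage after this change, assuming the plugin resolves from the default sbt plugin repositories:

# generate Eclipse project files for every project in the build
sbt eclipse
# optionally also download and attach library sources
sbt "eclipse with-source=true"
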
src/main/scala/org/repl/kafkasparkmongo/BXBookUserRatingsLoader.scala

Lines changed: 20 additions & 24 deletions
@@ -1,18 +1,18 @@
 package org.repl.kafkasparkmongo
 
-import java.util.{Arrays, Properties}
+import java.util.{ Arrays, Properties }
 
 import com.mongodb.spark._
-import com.mongodb.spark.config.{ReadConfig, WriteConfig}
+import com.mongodb.spark.config.{ ReadConfig, WriteConfig }
 import com.mongodb.spark.sql._
 import com.mongodb.spark.sql.fieldTypes.ObjectId
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
-import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-import org.apache.spark.{SparkConf, SparkContext}
-import org.repl.kafkasparkmongo.util.{SimpleKafkaClient, SparkKafkaSink}
-import org.apache.spark.sql.functions.{col, concat, lit, lower, split, substring, typedLit, udf}
+import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType }
+import org.apache.spark.streaming.kafka010.{ ConsumerStrategies, KafkaUtils, LocationStrategies }
+import org.apache.spark.streaming.{ Seconds, StreamingContext }
+import org.apache.spark.{ SparkConf, SparkContext }
+import org.repl.kafkasparkmongo.util.{ SimpleKafkaClient, SparkKafkaSink }
+import org.apache.spark.sql.functions.{ col, concat, lit, lower, split, substring, typedLit, udf }
 import org.mindrot.jbcrypt.BCrypt
 
 import scala.util.parsing.json.JSONObject
@@ -35,9 +35,7 @@ object BXBookUserRatingsLoader {
       LocationStrategies.PreferConsistent,
       ConsumerStrategies.Subscribe[String, String](
         Arrays.asList(topic),
-        props.asInstanceOf[java.util.Map[String, Object]]
-      )
-    )
+        props.asInstanceOf[java.util.Map[String, Object]]))
 
     val writeConfig = WriteConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.UserBookRating"))
 
@@ -50,7 +48,7 @@ object BXBookUserRatingsLoader {
     // the number of partitions of the topic (which also happens to be four.)
     println("*** " + r.getNumPartitions + " partitions")
     r.glom().foreach(a => println("*** partition size = " + a.size))
-    val toObjectId = udf[ObjectId,String](new ObjectId(_))
+    val toObjectId = udf[ObjectId, String](new ObjectId(_))
     val df = spark.read.json(r).withColumn("uid", toObjectId(col("userId")))
     df.printSchema()
     //r.foreach(s => println(s))
@@ -94,12 +92,12 @@ object BXBookUserRatingsLoader {
   }
 
   /**
-    * Publish some data to a topic. Encapsulated here to ensure serializable.
-    *
-    * @param sc
-    * @param topic
-    * @param config
-    */
+   * Publish some data to a topic. Encapsulated here to ensure serializable.
+   *
+   * @param sc
+   * @param topic
+   * @param config
+   */
   def send(sc: SparkContext, topic: String, config: Properties): Unit = {
     val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
 
@@ -113,8 +111,7 @@ object BXBookUserRatingsLoader {
     val mySchema = StructType(Array(
       StructField("usernum", StringType),
       StructField("ISBN", StringType),
-      StructField("rating", StringType)
-    ))
+      StructField("rating", StringType)))
     val dataFrame = spark.sqlContext
       .read
       .format("csv")
@@ -138,8 +135,7 @@ object BXBookUserRatingsLoader {
       col("firstname"),
       col("lastname"),
       col("ISBN"),
-      col("ratingNum").as("rating")
-    )
+      col("ratingNum").as("rating"))
     println("JoinedDF schema")
     joinedDF.printSchema()
     println("BookRatings count in joined dataframe: " + joinedDF.count())
@@ -159,9 +155,9 @@ object BXBookUserRatingsLoader {
           kafkaSink.value.send(topic, userId, JSONObject(mutableRowMap.toMap).toString())
         } catch {
           case npe: NullPointerException => {
-              println("Got NPE for rowMap " + rowMap)
+            println("Got NPE for rowMap " + rowMap)
           }
-          case e : Throwable => {
+          case e: Throwable => {
             println(e)
           }
         }
src/main/scala/org/repl/kafkasparkmongo/samples/BookRecommendation.scala

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+package org.repl.kafkasparkmongo.samples
+
+import com.mongodb.spark._
+import com.mongodb.spark.config.{ReadConfig, WriteConfig}
+import com.mongodb.spark.sql._
+import com.mongodb.spark.sql.fieldTypes.ObjectId
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.sql.{Row, SaveMode, SparkSession, functions}
+import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}
+import org.apache.spark.sql.functions.{col, concat, desc, lit, lower, split, substring, typedLit, udf}
+import org.apache.spark.streaming.{Seconds, StreamingContext}
+import org.apache.spark.ml.evaluation.RegressionEvaluator
+import org.apache.spark.ml.feature.StringIndexer
+import org.apache.spark.ml.recommendation.ALS
+import org.apache.spark.rdd.RDD
+import org.bson.Document
+
+object BookRecommendation {
+
+  case class students_cc(id: Int,
+                         year_graduated: String,
+                         name: String)
+
+  def main(args: Array[String]): Unit = {
+    //Start the Spark context
+    val conf = new SparkConf().setAppName("SimpleStreamingFromRDD").setMaster("local[4]")
+    val sc = new SparkContext(conf)
+    val ssc = new StreamingContext(sc, Seconds(1))
+
+    val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
+
+    //Ways to read mongodb
+    //val usersDF = spark.read.mongo(ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.User")))
+    //println("Users count: " + usersDF.count())
+    //println("usersDF schema")
+    //usersDF.printSchema()
+    //val df = spark.sqlContext.read.format("com.mongodb.spark.sql.DefaultSource").option("spark.mongodb.input.uri", "mongodb://lms:[email protected]/lms_db.User").load()
+    //usersDF.printSchema()
+
+    //Another way to read data from MongoDB
+    val bookDF = MongoSpark.load(spark.sparkContext, ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.Book"))).toDF()
+      .drop("Image-URL-L", "Image-URL-M", "Image-URL-S")
+    //uDF.show(false)
+    //uDF.printSchema()
+    //val aggregatedRdd = uDF.withPipeline(Seq(Document.parse("{ '$match': { 'Book-Title' : {'$regex' : '^Sherlock' } } }")))
+    //println(aggregatedRdd.count)
+
+    //Simple Popularity based Recommendation System
+    val bookRatingDF = MongoSpark.load(spark.sparkContext, ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.UserBookRating"))).toDF()
+    val raters = bookRatingDF.groupBy(functions.col("ISBN")).agg(functions.count("rating").as("count"))
+    val topRaters = raters.sort(desc("count")).toDF().limit(10)
+    val joinedDF = topRaters.join(bookDF, Seq("ISBN"))
+    joinedDF.show(false)
+    joinedDF.printSchema()
+
+    //Collaborative Filtering using ALS (alternating least squares) Spark ML
+    import spark.sqlContext.implicits._
+    //create DF with userId as integer (ALS requires integer ids)
+    val stringindexer1 = new StringIndexer().setInputCol("userId").setOutputCol("userIdNum")
+    val modelc1 = stringindexer1.fit(bookRatingDF)
+    val bookRatingT1DF = modelc1.transform(bookRatingDF)
+    val stringindexer2 = new StringIndexer().setInputCol("ISBN").setOutputCol("isbnNum")
+    val modelc2 = stringindexer2.fit(bookRatingT1DF)
+    val bookRatingNewDF = modelc2.transform(bookRatingT1DF)
+    //TODO: save the userId -> userIdNum and ISBN -> isbnNum mappings in mongodb
+
+    val Array(training, test) = bookRatingNewDF.randomSplit(Array(0.8, 0.2))
+    // Build the recommendation model using ALS on the training data
+    val als = new ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userIdNum").setItemCol("isbnNum").setRatingCol("rating")
+    val model = als.fit(training)
+    // Evaluate the model by computing the RMSE on the test data
+    // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
+    model.setColdStartStrategy("drop")
+    val predictions = model.transform(test)
+    val evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol("rating").setPredictionCol("prediction")
+    val rmse = evaluator.evaluate(predictions)
+    println(s"Root-mean-square error = $rmse")
+    //use the model to generate a set of 10 recommended books for each user in the dataset
+    //val docs = predictions.map( r => {
+    //  ( r.getAs("userId"), r.getAs("ISBN"), r.getAs("rating") )
+    //} )
+    //  .toDF( "userId", "ISBN", "rating" )
+
+    //println(s"Generate top 3 book recommendations for each user")
+    //val userRecs = model.recommendForAllUsers(3).show(truncate = false)
+    //println(s"Generate top 3 user recommendations for each book")
+    //val bookRecs = model.recommendForAllItems(3).show(truncate = false)
+
+    println(s"Generate top 3 book recommendations for a specified set of users (3)")
+    val users = bookRatingNewDF.select(als.getUserCol).distinct().limit(3)
+    val userSubsetRecs = model.recommendForUserSubset(users, 3).show(truncate = false)
+    println(s"Generate top 3 user recommendations for a specified set of books (3)")
+    val books = bookRatingNewDF.select(als.getItemCol).distinct().limit(3)
+    val booksSubSetRecs = model.recommendForItemSubset(books, 3).show(truncate = false)
+
+    // Directly from the prediction table: the where clause is for the case when we have a big DataFrame with many users
+    //model.transform(bookRatingNewDF.where('userId === givenUserId))
+    //  .select('isbn, 'prediction)
+    //  .orderBy('prediction.desc)
+    //  .limit(N)
+    //  .map { case Row(isbn: Int, prediction: Double) => (isbn, prediction) }
+    //  .collect()
+  }
+}
+
+
+
+
+

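The new file first builds a simple popularity list (the ten ISBNs with the most ratings, joined back to the Book collection) and then trains an ALS collaborative-filtering model, using StringIndexer to turn the string userId and ISBN columns into the numeric ids ALS requires. The recommendations it prints therefore contain the indexed ids, and the TODO in the file notes that the index mappings are not yet persisted. Below is a sketch, not part of the commit, of how the indexed recommendations could be translated back to the original ids with Spark ML's IndexToString, reusing the modelc1/modelc2 indexer models, the ALS model, and the users subset defined in the file; everything else is an assumption for illustration.

// Sketch only: map ALS's integer ids back to the original userId / ISBN strings.
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.sql.functions.{col, explode}

// recommendForUserSubset returns (userIdNum, recommendations: array<struct<isbnNum, rating>>)
val recs = model.recommendForUserSubset(users, 3)
  .withColumn("rec", explode(col("recommendations")))
  .select(
    col("userIdNum"),
    col("rec.isbnNum").cast("double").as("isbnNum"),
    col("rec.rating").as("predictedRating"))

// Reverse the indexing using the labels captured by the fitted StringIndexerModels.
val userIdBack = new IndexToString()
  .setInputCol("userIdNum").setOutputCol("userId").setLabels(modelc1.labels)
val isbnBack = new IndexToString()
  .setInputCol("isbnNum").setOutputCol("ISBN").setLabels(modelc2.labels)

isbnBack.transform(userIdBack.transform(recs))
  .select("userId", "ISBN", "predictedRating")
  .show(truncate = false)

Persisting modelc1.labels and modelc2.labels (or the two fitted StringIndexerModels themselves) would also satisfy the TODO about storing the mappings in MongoDB.
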
src/main/scala/org/repl/kafkasparkmongo/samples/SparkScalaMongo.scala

Lines changed: 69 additions & 1 deletion
@@ -45,6 +45,74 @@ object SparkScalaMongo {
     //Load data again to check if the insert was successful
     val studentsData = MongoSpark.load(spark)
     studentsData.show(false)
-
   }
+
+  //https://github.com/mongodb/mongo-spark/tree/master/examples/src/test/scala/tour
+
+  //Read
+  //val df3 = sparkSession.loadFromMongoDB(ReadConfig(Map("uri" -> "mongodb://example.com/database.collection"))) // ReadConfig used for configuration
+  //val df4 = sparkSession.read.mongo() // SparkSession used for configuration
+  //sqlContext.read.format("mongo").load()
+  //usersDF = spark.read.mongo(ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.User")))
+  //MongoSpark.load[Character](sparkSession, ReadConfig(Map("collection" -> "hundredClub"), Some(ReadConfig(sparkSession)))).show()
+
+
+  //# Write to MongoDB
+  //#1
+  //sparkSession.sparkContext.parallelize(docs.map(Document.parse)).saveToMongoDB()
+  //#2
+  //sparkdf.write.option("collection", "hundredClub").mode("overwrite").mongo()
+  //sparkdf.write.option("collection", "hundredClub").format("mongo").save()
+  //sparkdf.write.format("com.mongodb.spark.sql.DefaultSource")
+  //  .mode("append").option("spark.mongodb.output.uri", "mongodb://host101:27017/dbName.collName")
+  //  .option("replaceDocument", "false")
+  //  .save()
+  //#3
+  //MongoSpark.save(df.write.mode("overwrite"), writeConfig) //drops the collection before writing the results, if the collection already exists
+
+
+
+
+  //query
+  //personDf.select($"_id", $"addresses"(0)("street"), $"country"("name"))
+  //val aggregatedRdd = uDF.withPipeline(Seq(Document.parse("{ '$match': { 'Book-Title' : {'$regex' : '^Sherlock' } } }")))
+
+  //mongodb
+  //db.zipcodes.aggregate( [
+  //  { $group: { _id: "$state", totalPop: { $sum: "$pop" } } },
+  //  { $match: { totalPop: { $gte: 10*1000*1000 } } }
+  //] )
+  //spark equivalent
+  //println( "States with Populations above 10 Million" )
+  //import zipDf.sqlContext.implicits._ // 1)
+  //zipDf.groupBy("state")
+  //  .sum("pop")
+  //  .withColumnRenamed("sum(pop)", "count") // 2)
+  //  .filter($"count" > 10000000)
+  //  .show()
+
+  //* aggregate with connector
+  //val aggregatedRdd = uDF.withPipeline(Seq(Document.parse("{ '$match': { 'Book-Title' : {'$regex' : '^Sherlock' } } }")))
+
+  //BasicDBObject dateRange = new BasicDBObject("$gte", new Date(current.getYear(), current.getMonth(), current.getDate()));
+  //dateRange.put("$lt", new Date(current.getYear(), current.getMonth() - 1, current.getDate()));
+  //BasicDBObject query = new BasicDBObject("created_on", dateRange);
+  //OR: BasicDBObject query = new BasicDBObject("created_on", new BasicDBObject("$gte", new DateTime().toDate()).append("$lt", new DateTime().toDate()));
+  //rdd.withPipeline(singletonList(query));
+
+  //using predicate pushdown (filter and select)
+  //zipDf
+  //  .filter($"pop" > 0)
+  //  .select("state")
+  //  .explain(true)
+
+  //using sqlContext
+  //zipDf.createOrReplaceTempView("zips") // 1)
+  //zipDf.sqlContext.sql( // 2)
+  //  """SELECT state, sum(pop) AS count
+  //     FROM zips
+  //     GROUP BY state
+  //     HAVING sum(pop) > 10000000"""
+  //)
+  //  .show()
 }

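The block added above is a set of commented-out notes on reading, writing, and aggregating with the MongoDB Spark connector. Below is a runnable distillation of those notes, not part of the commit, against a hypothetical zipcodes collection with state and pop fields as used in the commented aggregation example; the connection URIs are placeholders, since the real ones in this repo are obscured as [email protected].

// Sketch only, assuming mongo-spark-connector 2.4.0 and Spark 2.4.3 as pinned in build.sbt.
import com.mongodb.spark._
import com.mongodb.spark.config.{ReadConfig, WriteConfig}
import org.apache.spark.sql.SparkSession
import org.bson.Document

val spark = SparkSession.builder
  .appName("MongoConnectorSketch")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Read a collection into a DataFrame (URI and collection are placeholders).
val readConfig = ReadConfig(Map("uri" -> "mongodb://localhost:27017/test.zipcodes"))
val zipDf = MongoSpark.load(spark, readConfig)

// DataFrame equivalent of the $group / $match pipeline in the notes above.
val bigStates = zipDf.groupBy("state")
  .sum("pop")
  .withColumnRenamed("sum(pop)", "count")
  .filter($"count" > 10000000)
bigStates.show()

// Push a $match stage down to MongoDB via the connector's aggregation pipeline.
val matched = MongoSpark.load(spark.sparkContext, readConfig)
  .withPipeline(Seq(Document.parse("{ '$match': { 'pop': { '$gt': 0 } } }")))
println(matched.count)

// Write the aggregated result to another collection (URI is a placeholder).
val writeConfig = WriteConfig(Map("uri" -> "mongodb://localhost:27017/test.statePopulation"))
MongoSpark.save(bigStates, writeConfig)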