package org.repl.kafkasparkmongo.samples

import com.mongodb.spark._
import com.mongodb.spark.config.{ReadConfig, WriteConfig}
import com.mongodb.spark.sql._
import com.mongodb.spark.sql.fieldTypes.ObjectId
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SaveMode, SparkSession, functions}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}
import org.apache.spark.sql.functions.{col, concat, desc, lit, lower, split, substring, typedLit, udf}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.rdd.RDD
import org.bson.Document
object BookRecommendation {

  // Currently unused in this sample; kept for reference.
  case class students_cc(id: Int,
                         year_graduated: String,
                         name: String)

  def main(args: Array[String]): Unit = {
    // Start the Spark context
    val conf = new SparkConf().setAppName("BookRecommendation").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Note: this StreamingContext is never started; nothing below consumes a stream.
    val ssc = new StreamingContext(sc, Seconds(1))

    val spark = SparkSession.builder.config(sc.getConf).getOrCreate()

    // Other ways to read from MongoDB:
    //val usersDF = spark.read.mongo(ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.User")))
    //println("Users count: " + usersDF.count())
    //usersDF.printSchema()
    //val usersDF2 = spark.sqlContext.read.format("com.mongodb.spark.sql.DefaultSource").option("spark.mongodb.input.uri", "mongodb://lms:[email protected]/lms_db.User").load()
    //usersDF2.printSchema()

    // Another way to read data from MongoDB
    val bookDF = MongoSpark.load(spark.sparkContext, ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.Book"))).toDF()
      .drop("Image-URL-L", "Image-URL-M", "Image-URL-S")
    //bookDF.show(false)
    //bookDF.printSchema()
    // An aggregation pipeline can be pushed down to MongoDB on the MongoRDD, i.e. before calling toDF():
    //val aggregatedRdd = MongoSpark.load(spark.sparkContext, ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.Book"))).withPipeline(Seq(Document.parse("{ '$match': { 'Book-Title' : {'$regex' : '^Sherlock' } } }")))
    //println(aggregatedRdd.count)

    // Simple popularity-based recommendation: rank books by how many ratings they received
    val bookRatingDF = MongoSpark.load(spark.sparkContext, ReadConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.UserBookRating"))).toDF()
    val raters = bookRatingDF.groupBy(functions.col("ISBN")).agg(functions.count("rating").as("count"))
    val topRaters = raters.sort(desc("count")).limit(10)
    val joinedDF = topRaters.join(bookDF, Seq("ISBN"))
    joinedDF.show(false)
    joinedDF.printSchema()
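
    // A possible follow-up (not in the original code): persist the popularity ranking back to
    // MongoDB so it can be served without re-running Spark. The "PopularBooks" collection
    // name is an assumption for illustration.
    //MongoSpark.save(joinedDF, WriteConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.PopularBooks")))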

    // Collaborative filtering using ALS (alternating least squares) from Spark ML
    import spark.implicits._
    // ALS requires numeric ids, so index the string userId and ISBN columns
    val stringIndexer1 = new StringIndexer().setInputCol("userId").setOutputCol("userIdNum")
    val modelc1 = stringIndexer1.fit(bookRatingDF)
    val bookRatingT1DF = modelc1.transform(bookRatingDF)
    val stringIndexer2 = new StringIndexer().setInputCol("ISBN").setOutputCol("isbnNum")
    val modelc2 = stringIndexer2.fit(bookRatingT1DF)
    val bookRatingNewDF = modelc2.transform(bookRatingT1DF)
    //TODO: save the userId -> userIdNum and ISBN -> isbnNum mappings in MongoDB (see the sketch below)
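    // A sketch of the TODO above, assuming the Spark 2.x StringIndexerModel API, where
    // .labels lists the original strings ordered by their numeric index. The collection
    // names UserIdMap and IsbnMap are illustrative assumptions.
    //val userMapDF = modelc1.labels.zipWithIndex.toSeq.toDF("userId", "userIdNum")
    //MongoSpark.save(userMapDF, WriteConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.UserIdMap")))
    //val isbnMapDF = modelc2.labels.zipWithIndex.toSeq.toDF("ISBN", "isbnNum")
    //MongoSpark.save(isbnMapDF, WriteConfig(Map("uri" -> "mongodb://lms:[email protected]/lms_db.IsbnMap")))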

    val Array(training, test) = bookRatingNewDF.randomSplit(Array(0.8, 0.2))
    // Build the recommendation model using ALS on the training data
    val als = new ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userIdNum").setItemCol("isbnNum").setRatingCol("rating")
    val model = als.fit(training)
    // Evaluate the model by computing the RMSE on the test data.
    // Note: the cold-start strategy is set to 'drop' so we don't get NaN evaluation metrics.
    model.setColdStartStrategy("drop")
    val predictions = model.transform(test)
    val evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol("rating").setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error = $rmse")
    // Use the model to generate recommendations, e.g. collect predictions per user:
    //val docs = predictions.map(r =>
    //    (r.getAs[String]("userId"), r.getAs[String]("ISBN"), r.getAs[Int]("rating"))
    //  ).toDF("userId", "ISBN", "rating")

    //println("Generate top 3 book recommendations for each user")
    //model.recommendForAllUsers(3).show(truncate = false)
    //println("Generate top 3 user recommendations for each book")
    //model.recommendForAllItems(3).show(truncate = false)

| 89 | + println(s"Generate top 3 movie recommendations for a specified set of users (3)") |
| 90 | + val users = bookRatingNewDF.select(als.getUserCol).distinct().limit(3) |
| 91 | + val userSubsetRecs = model.recommendForUserSubset(users, 3).show(truncate = false) |
| 92 | + println(s"Generate top 3 user recommendations for a specified set of books (3)") |
| 93 | + val books = bookRatingNewDF.select(als.getItemCol).distinct().limit(3) |
| 94 | + val booksSubSetRecs = model.recommendForItemSubset(books, 3).show(truncate = false) |
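
    // A sketch (not in the original) of mapping the indexed ids in the recommendations back
    // to the original ISBN strings with IndexToString, the inverse of the StringIndexer above.
    // Exploding the nested recommendations column yields one flat row per recommended book.
    //import org.apache.spark.ml.feature.IndexToString
    //val flatRecs = model.recommendForUserSubset(users, 3)
    //  .select(col("userIdNum"), functions.explode(col("recommendations")).as("rec"))
    //  .select(col("userIdNum"), col("rec.isbnNum").as("isbnNum"), col("rec.rating").as("score"))
    //val toIsbn = new IndexToString().setInputCol("isbnNum").setOutputCol("ISBN").setLabels(modelc2.labels)
    //toIsbn.transform(flatRecs).show(truncate = false)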

    // Alternatively, rank straight from the prediction table; useful when the DataFrame is
    // large and we only need recommendations for one user (givenUserId and N are placeholders):
    //model.transform(bookRatingNewDF.where($"userIdNum" === givenUserId))
    //  .select($"isbnNum", $"prediction")
    //  .orderBy($"prediction".desc)
    //  .limit(N)
    //  .map { case Row(isbnNum: Double, prediction: Float) => (isbnNum, prediction) }
    //  .collect()
  }
}