2 changes: 2 additions & 0 deletions Awesome-Scripts/List of ideas.md
@@ -51,3 +51,5 @@ This program renames a list of tv episodes. Episodes can be nested inside folder
### 22. [Reddit Scrapping.py](reddit_scrapping.py)
Scraper for Reddit.

### 23. [Amazon Review Scrapping and storing in CSV file](amazonReviewInCsv.py)
* Just enter the **Product Name** and the script scrapes the product reviews (author, date, rating, review text) and stores them in a CSV file for further analysis (e.g. sentiment analysis). To avoid getting the IP blocked, a random delay is added between requests.
48 changes: 48 additions & 0 deletions Awesome-Scripts/amazonReviewInCsv.py
@@ -0,0 +1,48 @@
import requests
import csv
import time
from random import randint
from bs4 import BeautifulSoup

product = input("Enter the Product Name ")
url = "http://www.amazon.in/s/keywords=" + product
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
# ASIN of the first search result
link = soup.find("li", {"id": "result_0"})["data-asin"]

empty_pages = 0  # consecutive empty responses seen so far
with open(product + ".csv", "a", newline="") as csvfile:
    fieldnames = ["Author", "date", "Rating", "title", "review"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    page = 1
    while page < 200:
        rurl = ("http://www.amazon.in/product-reviews/" + link
                + "/ref=cm_cr_arp_d_paging_btm_2?pageNumber=" + str(page))
        r = requests.get(rurl)
        soup = BeautifulSoup(r.content, "lxml")
        reviews = soup.find_all("div", {"class": "review"})
        print("scraping page = " + str(page))
        if not reviews:
            # Empty page: back off with a random delay and retry the same
            # page; give up after five consecutive empty responses.
            delay = randint(0, 5)
            print("delay = " + str(delay))
            empty_pages += 1
            time.sleep(delay)
            if empty_pages == 5:
                print("NO MORE REVIEWS")
                break
        else:
            empty_pages = 0
            for item in reviews:
                author = item.find("a", {"class": "author"}).text
                rate = item.find("span", {"class": "a-icon-alt"}).text
                title = item.find("a", {"class": "review-title"}).text
                date = item.find("span", {"class": "review-date"}).text
                text = item.find("span", {"class": "review-text"}).text
                try:
                    writer.writerow({"Author": author,
                                     "date": date,
                                     "Rating": rate,
                                     "title": title,
                                     "review": text})
                except Exception:
                    pass
            page += 1
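
As written, the script only sleeps when a page comes back empty, so successive successful page fetches go out back-to-back. A minimal sketch of a delay helper that could be called before every page request (the function name and the base/jitter values are illustrative, not part of the script):

```python
import time
from random import uniform

def jittered_delay(base=1.0, jitter=2.0):
    """Return a randomized pause length in seconds, used to space out
    successive scrape requests so they look less like a bot."""
    return base + uniform(0, jitter)

# Before each page request:
# time.sleep(jittered_delay())
```

Calling `time.sleep(jittered_delay())` at the top of the page loop would apply the random delay between all requests, matching what the README entry describes.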