diff --git a/Awesome-Scripts/List of ideas.md b/Awesome-Scripts/List of ideas.md
index 27ff76c..bde6cf2 100644
--- a/Awesome-Scripts/List of ideas.md
+++ b/Awesome-Scripts/List of ideas.md
@@ -51,3 +51,5 @@ This program renames a list of tv episodes. Episodes can be nested inside folder
 ### 22. [Reddit Scrapping.py](reddit_scrapping.py)
 Scrapper for Reddit.
+### 23. [Amazon Review Scraping and storing in a CSV file](amazonReviewInCsv.py)
+* Just enter a **Product Name** and the script scrapes the product's reviews (author, date, rating, title, review text). A random delay is added between requests to avoid getting the IP blocked, and the data is stored in a CSV file so it can be used for further analysis (e.g. sentiment analysis).
diff --git a/Awesome-Scripts/amazonReviewInCsv.py b/Awesome-Scripts/amazonReviewInCsv.py
new file mode 100644
index 0000000..546aff2
--- /dev/null
+++ b/Awesome-Scripts/amazonReviewInCsv.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+import csv
+import sys
+import time
+from random import randint
+
+import requests
+from bs4 import BeautifulSoup
+
+# A browser-like User-Agent makes it less likely that Amazon rejects the request outright.
+HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0 Safari/537.36"}
+
+product = input("Enter the Product Name ")
+
+# Search for the product and take the ASIN of the first result.
+# Amazon's URL layout and markup change over time, so this URL and the selectors below may need updating.
+search_url = "https://www.amazon.in/s?field-keywords=" + product
+r = requests.get(search_url, headers=HEADERS)
+soup = BeautifulSoup(r.content, "lxml")
+first_result = soup.find("li", {"id": "result_0"})
+if first_result is None:
+    print("No search results found for " + product)
+    sys.exit(1)
+asin = first_result["data-asin"]
+
+with open(product + ".csv", "a", newline="", encoding="utf-8") as csvfile:
+    fieldnames = ["Author", "date", "Rating", "title", "review"]
+    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+    writer.writeheader()
+
+    page = 1
+    empty_pages = 0  # consecutive pages that returned no reviews
+    while page <= 200:
+        review_url = ("https://www.amazon.in/product-reviews/" + asin +
+                      "/ref=cm_cr_arp_d_paging_btm_2?pageNumber=" + str(page))
+        r = requests.get(review_url, headers=HEADERS)
+        soup = BeautifulSoup(r.content, "lxml")
+        reviews = soup.find_all("div", {"class": "review"})
+        print("scraping page = " + str(page))
+
+        if not reviews:
+            # Either the request was blocked or there are no more reviews:
+            # retry the same page after a random delay, and give up after 5 misses in a row.
+            empty_pages += 1
+            if empty_pages == 5:
+                print("NO MORE REVIEWS")
+                break
+            delay = randint(1, 5)
+            print("delay = " + str(delay))
+            time.sleep(delay)
+            continue
+
+        empty_pages = 0
+        for item in reviews:
+            try:
+                author = item.find("a", {"class": "author"}).text
+                rate = item.find("span", {"class": "a-icon-alt"}).text
+                title = item.find("a", {"class": "review-title"}).text
+                date = item.find("span", {"class": "review-date"}).text
+                text = item.find("span", {"class": "review-text"}).text
+                writer.writerow({"Author": author,
+                                 "date": date,
+                                 "Rating": rate,
+                                 "title": title,
+                                 "review": text})
+            except AttributeError:
+                # Skip review blocks that are missing one of the expected fields.
+                continue
+
+        page += 1
+        # Random pause between pages so requests are spaced out and the IP is less likely to be blocked.
+        time.sleep(randint(1, 3))
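
As a usage note, the CSV produced by `amazonReviewInCsv.py` can be read back with the standard `csv` module for the follow-up analysis mentioned in the README entry. A minimal sketch, assuming the product name entered at the prompt was `iPhone` (so the script wrote `iPhone.csv`); the filename is hypothetical:

```python
import csv

# Read the reviews written by amazonReviewInCsv.py; "iPhone.csv" is a hypothetical
# example of the file produced when "iPhone" was entered at the prompt.
with open("iPhone.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        # Columns match the fieldnames used by the scraper: Author, date, Rating, title, review.
        print(row["Rating"], row["title"])
```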