"""Print car page links scraped from carsprite.com.

Reconstructed from a two-commit patch series (558fda8, 441058c) that created
and then updated ``scripts/webscraping.py``.  This is the file's final state
after both patches, with robustness fixes applied.
"""

import requests
from bs4 import BeautifulSoup

# Every URL this script builds is this prefix plus an href taken from a page.
SITE_PREFIX = "https://www.carsprite.com/en/"
CAR_PRICES_URL = SITE_PREFIX + "car-prices/"

# Seconds to wait for the server; without a timeout ``requests`` can block
# forever on an unresponsive host.
REQUEST_TIMEOUT = 10

# An href containing any of these substrings is skipped on a brand page.
_SKIPPED_HREF_MARKERS = ("/en/", "https", "/car-prices/")


def _anchor_hrefs(page_url):
    """Return the ``href`` value of every ``<a>`` tag on *page_url*.

    Anchors that have no ``href`` attribute are left out, so callers always
    receive strings (never ``None``).

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]


def cars_brand_links():
    """Visit every ``car-prices/`` link found on the car-prices index page.

    Each href on the index page is appended to ``SITE_PREFIX``; when the
    resulting URL contains ``car-prices/`` it is handed to
    ``get_single_item_data``.
    """
    for href in _anchor_hrefs(CAR_PRICES_URL):
        brand_url = SITE_PREFIX + href
        if "car-prices/" not in brand_url:
            continue
        get_single_item_data(brand_url)


def get_single_item_data(brand_url):
    """Print the item links found on the page at *brand_url*.

    An href is printed, prefixed with ``CAR_PRICES_URL``, only when it
    contains none of ``/en/``, ``https`` or ``/car-prices/``.
    NOTE(review): presumably those survivors are the relative links to
    individual car pages -- confirm against the live site markup.

    Args:
        brand_url: Absolute URL of the page to scan.
    """
    for href in _anchor_hrefs(brand_url):
        if any(marker in href for marker in _SKIPPED_HREF_MARKERS):
            continue
        print(CAR_PRICES_URL + href)


# Runs on import as well as on direct execution, as in the original script.
cars_brand_links()