
Commit 0259b6d

scraping
1 parent dd932fd commit 0259b6d

6 files changed: +183 additions, -3 deletions

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.15
#
# title: Trouble outputting data with Scrapy
# url: https://stackoverflow.com/questions/68386890/trouble-outputting-data-with-scrapy/68387811#68387811


import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)

            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()

        category = response.xpath('//p[@class="in-category"]//a/text()').extract()

        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()

        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }


from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.23
#
# title: How to properly extract data that returns None in Python Beautifulsoup
# url: https://stackoverflow.com/questions/68493228/how-to-properly-extract-data-that-returns-none-in-python-beautifulsoup/68493491?noredirect=1#comment121049699_68493491

from bs4 import BeautifulSoup
from urllib import request
from urllib.request import Request, urlopen


url = 'https://bscscan.com/tx/0x1b6f00c8cd99e0daac5718c743ef9a51af40f95feae23bf29960ae1f66a1cff7'
#url = 'https://bscscan.com/tx/0xc54d83b870a1b4159f12bff092c8a24dfa045e133b07d3a3a41898293ac86c71'
headers = {'User-Agent': 'Mozilla/5.0'}

req = Request(url, headers=headers)
html = urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')

val = soup.find('span', class_='u-label u-label--value u-label--secondary text-dark rounded mr-1').text
transfee = soup.find('span', id='ContentPlaceHolder1_spanTxFee').text
fromaddr = soup.find('span', id='spanFromAdd').text
token = soup.find('span', class_='hash-tag text-truncate hash-tag-custom-from tooltip-address').text

print("From: ", fromaddr)
print("Value: ", val)
print("Transaction Fee: ", transfee)
print("Tokens:")

main_data = soup.select("ul#wrapperContent div.media-body")

for item in main_data:
    all_span = item.find_all("span", class_='mr-1')
    #for number, span in enumerate(all_span):
    #    print(number, span.get_text(strip=True))
    last_span = all_span[-1]

    all_a = item.find_all("a")
    last_a = all_a[-1]

    print("{:>35} | {:18} | https://bscscan.com{}".format(last_span.get_text(strip=True), last_a.get_text(strip=True), last_a['href']))
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.23
#
# title: How to iterate through web table with Selenium?
# url: https://stackoverflow.com/questions/68493382/how-to-iterate-through-web-table-with-selenium/68494220#68494220

# DOESN'T WORK WITH FIREFOX BECAUSE SERVER SHOWS CAPTCHA


from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

#paths
#PATH = "C:/Program Files (x86)\chromedriver.exe"
#driver = webdriver.Chrome(PATH)
driver = webdriver.Chrome()
#driver = webdriver.Firefox()

url = "https://www.crunchbase.com/search/organizations/field/organization.companies/categories/electric-vehicle"
driver.get(url)
driver.maximize_window()
time.sleep(5)

print('title:', driver.title)

WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located(
        (By.XPATH, '//grid-body//identifier-formatter/a/div/div')
    )
)

all_rows = driver.find_elements_by_css_selector("grid-row")

all_companies = []

for row in all_rows:
    company = {
        'name': row.find_element_by_xpath('.//*[@class="identifier-label"]').text.strip(),
        'industry': row.find_element_by_xpath('.//*[@data-columnid="categories"]//span').text.strip(),
        'hq': row.find_element_by_xpath('.//*[@data-columnid="location_identifiers"]//span').text.strip(),
    }
    all_companies.append(company)

#create dataframe
df = pd.DataFrame(all_companies)
print(df)
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# author: Bartlomiej "furas" Burek (https://blog.furas.pl)
# date: 2021.07.15
#
# title: How to scrap string that does not have unique ID for data extraction? [closed]
# url: https://stackoverflow.com/questions/68394176/how-to-scrap-string-that-does-not-have-unique-id-for-data-extraction

from bs4 import BeautifulSoup as BS
import requests

url = 'https://www.magicbricks.com/property-for-sale-in-namakkal-pppfs'
r = requests.get(url)

soup = BS(r.text, 'html.parser')

all_items = soup.find_all('span', class_='m-srp-card__title')
for item in all_items:
    print('for Sale in', item.text.split('for Sale in')[1].strip())

__scraping__/pixabay.com - requests, BS/README.md

Lines changed: 10 additions & 0 deletions
@@ -4,3 +4,13 @@
 # author: Bartłomiej "furas" Burek (https://blog.furas.pl)
 
 # https://stackoverflow.com/questions/63767927/cant-scrape-some-static-image-links-from-a-webpage-using-requests
+
+---
+
+Update: 2021.07.15
+
+The web page changed some class names and attributes, so the code needed updates:
+
+- `.results--efirA` instead of `.search_results`
+- `data-lazy-src` instead of `data-lazy`

__scraping__/pixabay.com - requests, BS/main.py

Lines changed: 6 additions & 3 deletions
@@ -2,6 +2,8 @@
 # date: 2020.09.07
 # author: Bartłomiej "furas" Burek (https://blog.furas.pl)
 # https://stackoverflow.com/questions/63767927/cant-scrape-some-static-image-links-from-a-webpage-using-requests
+#
+# update: 2021.07.15
 
 import requests
 from bs4 import BeautifulSoup
@@ -42,14 +44,15 @@
 
 soup = BeautifulSoup(r.text, "lxml")
 
-for item in soup.select(".search_results a > img[src]"):
+#for item in soup.select("[data-hid='photo_list_results'] a > img[src]"):
+for item in soup.select(".results--efirA a > img[src]"):
     src = item.get("src")
     if src is not None and 'blank.gif' not in src:
         print('src:', src)
         results.append(src)
     else:
-        src = item.get("data-lazy")
-        print('data-lazy:', src)
+        src = item.get("data-lazy-src")
+        print('data-lazy-src:', src)
         results.append(src)
 
 print('len:', len(results))
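
Since the fix above only swaps in the current attribute name, a minimal defensive sketch (not part of this commit; the search URL and request headers are assumptions) could check `src`, the new `data-lazy-src`, and the old `data-lazy` in turn, so the scraper keeps working across such renames:

# minimal sketch, not from the repository: fall back through several
# possible image attributes because pixabay.com has renamed them before
import requests
from bs4 import BeautifulSoup

# example search page; the original script may build this URL differently
r = requests.get('https://pixabay.com/images/search/dogs/',
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.text, 'lxml')

results = []
for img in soup.select('.results--efirA a > img'):
    # take the first attribute that holds a real image URL (skip the blank.gif placeholder)
    for attr in ('src', 'data-lazy-src', 'data-lazy'):
        src = img.get(attr)
        if src and 'blank.gif' not in src:
            results.append(src)
            break

print('len:', len(results))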
