From b48c0251c72a9d4a726df9b7e7b6f3a9e0e09842 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:22:32 -0500 Subject: [PATCH] ca_yt: Add comment about Cloudflare bot products --- ca_yt/people.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ca_yt/people.py b/ca_yt/people.py index ae18f27d..293c8690 100644 --- a/ca_yt/people.py +++ b/ca_yt/people.py @@ -5,22 +5,20 @@ from utils import CanadianScraper COUNCIL_PAGE = "https://yukonassembly.ca/mlas" -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0.1 Safari/605.1.15' -COOKIES = { - '__cf_bm': 'F6Hu6MMBLKVvWHRnv4jMKjzC6rPO.eZiP7e2wFmDDuk-1731447448-1.0.1.1-rOXfHAF4pu2oOjWi79k_ktxvpxutL0x.BKYzcxgqooaC0mZe.oRHqJe_bLzTcFHixlhjd4luXPSxO9kv08_7vw' -} +# This website uses Cloudflare bot products (setting a __cf_bm cookie), which is hard to circumvent. +# https://developers.cloudflare.com/fundamentals/reference/policies-compliances/cloudflare-cookies/ class YukonPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE, cookies=COOKIES, user_agent=USER_AGENT) + page = self.lxmlize(COUNCIL_PAGE) members = page.xpath('//*[@id="block-views-block-members-listing-block-1"]/div/div/div[2]/div') assert len(members), "No members found" for member in members: if "Vacant" not in member.xpath("./div/span")[0].text_content(): url = member.xpath("./div/span/a/@href")[0] - page = self.lxmlize(url, cookies=COOKIES, user_agent=USER_AGENT) + page = self.lxmlize(url) name = page.xpath("//html/body/div[1]/div/div/section/div[2]/article/div/h1/span/span")[ 0 ].text_content()