|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# date: 2019.12.12 |
| 4 | +# https://stackoverflow.com/questions/59259699/scrapy-formrequest-parameter-not-working-but-showing-all-result-instead/ |
| 5 | +# page: https://researchgrant.gov.sg/eservices/advanced-search/ |
| 6 | + |
| 7 | +import scrapy |
| 8 | +import urllib.parse |
| 9 | + |
class MySpider(scrapy.Spider):
    """Scrape awarded-project listings from researchgrant.gov.sg.

    The advanced-search page renders its results table via MVCGrid AJAX
    POST requests, so the spider posts directly to the grid endpoint
    (``/eservices/mvcgrid``) with the search filters in the query string,
    walks every result page, and follows each project link to collect
    the project id from the detail page.
    """

    name = 'myspider'
    #allowed_domains = []

    # MVCGrid AJAX endpoint that returns one page of the results table.
    search_url = 'https://researchgrant.gov.sg/eservices/mvcgrid'

    # POST body: selects which grid to render.
    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    # Query-string arguments: the search filters. `page` is incremented
    # in place as the spider paginates through the results.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',

        #'_pp_hiname': 'tan',
        #'_pp_piname': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',  #'pua',

        '_pp_source': '',
        '_pp_details': '',
    }

    def _grid_request(self):
        """Build the AJAX POST request for the current ``self.args['page']``.

        The grid endpoint expects the filters URL-encoded in the query
        string, the grid name in the POST body, and the
        ``X-Requested-With`` header (it serves the page fragment only to
        AJAX callers).
        """
        url = self.search_url + '?' + urllib.parse.urlencode(self.args)
        return scrapy.FormRequest(
            url,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'},
        )

    def start_requests(self):
        """Start the crawl with the first page of the results grid."""
        yield self._grid_request()

    def parse_item(self, response):
        """Parse one grid page: yield a detail request per row, then paginate.

        Each row's cells are: link/title, status, PI, HI, date. The
        partially-built item travels to ``parse_product`` via ``meta``.
        """
        rows = response.xpath(
            '//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr')
        for row in rows:
            cols = row.xpath('.//td')
            # `or ''` guards against missing anchors/cells: .get() returns
            # None in that case and a bare .strip() would raise AttributeError.
            link = (cols[0].xpath('.//a/@href').get() or '').strip()
            title = (cols[0].xpath('.//a/text()').get() or '').strip()
            status = (cols[1].xpath('.//text()').get() or '').strip()
            pi = (cols[2].xpath('.//text()').get() or '').strip()
            hi = (cols[3].xpath('.//text()').get() or '').strip()
            date = (cols[4].xpath('.//text()').get() or '').strip()

            if not link:
                # Row without a project link — nothing to follow.
                continue

            item = {
                #'id': project_id,
                'status': status,
                'title': title,
                'link': link,
                'pi': pi,
                'hi': hi,
                'date': date,
            }

            # few links are redirected to main page so they are filtered
            # and it needs `dont_filter=True`. urljoin() also handles the
            # case where the href is site-relative.
            yield scrapy.Request(response.urljoin(link), meta={'item': item},
                                 callback=self.parse_product, dont_filter=True)

        # create request for next page: the grid exposes a "Next page"
        # control only while more pages remain.
        if response.xpath('//a[@aria-label="Next page"]/@onclick').get():
            self.args['page'] += 1
            yield self._grid_request()

    def parse_product(self, response):
        """Parse a project detail page and emit the completed item.

        Only the external project id is scraped here; the remaining
        fields were already captured from the grid row.
        """
        item = response.meta['item']

        # .extract_first() or .get() instead of .extract()
        project_id = response.xpath(
            '//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjIdExt"]/text()').get()
        #title = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle"]/text()').get()
        #pi = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblLeadPIName"]/text()').get()
        #hi = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblHostInstName"]/text()').get()
        #date = response.xpath('//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_dtPickerStartDate"]/text()').get()
        # etc.
        item['id'] = project_id

        yield item
| 101 | + |
# --- run without a Scrapy project; results are saved to `output.csv` ---

from scrapy.crawler import CrawlerProcess

settings = {
    'USER_AGENT': 'Mozilla/5.0',
    # export feed settings: format may be csv, json or xml
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
}

process = CrawlerProcess(settings)
process.crawl(MySpider)
process.start()