I'm crawling through some directories with ASP.NET programming via Scrapy.
The pages to crawl through are encoded as such:
javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')
where X is an int between 1 and 180. The problem is that the url remains the same when I clicked next page or any page. I've written down some codes below which can only scrapy each link within the first page.
# -*- coding: utf-8 -*- import scrapy from bs4 import BeautifulSoup import re from scrapy.http import FormRequest import js2xml import requests from datetime import datetime HEADERS = { 'X-MicrosoftAjax': 'Delta=true', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36' } URL = 'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001' class nnggzySpider(scrapy.Spider): name = 'nnggzygov' start_urls = [ 'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001' ] base_url = 'https://www.nnggzy.org.cn' custom_settings = { 'LOG_LEVEL': 'ERROR' } def parse(self, response): self.data = {} soup = BeautifulSoup(response.body, 'html.parser') tags = soup.find_all('a', href=re.compile(r"InfoDetail")) ##pages = soup.select('font b')[1].text for tag in tags: url = nnggzySpider.base_url + tag.get('href') yield scrapy.Request(url, callback=self.parse_details) def parse_details(self, response): soup = BeautifulSoup(response.body, 'html.parser') selectors = ['td#TDContent.infodetail div div.MsoNormal font span'] for selector in selectors: agent_name = soup.select(selector)[31].text[13:] project_name = soup.select(selector)[3].text start_time = soup.select(selector)[20].text[9:] budget = re.sub("\D", "", soup.select(selector)[4].text) id = soup.select(selector)[2].text[1:] print(id, agent_name, project_name, start_time, budget)
Can anyone help me with this?
https://stackoverflow.com/questions/66575494/scrapy-crawling-through-pages-with-postback-data-javascript-url-doesnt-change March 11, 2021 at 10:06AM
没有评论:
发表评论