有些事如何做: Scrapy crawling through pages with PostBack data javascript url doesn't change

I'm using Scrapy to crawl some directory listings on a site built with ASP.NET.

The pages to crawl through are encoded as such:

javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')

where X is an integer between 1 and 180. The problem is that the URL stays the same when I click the next page or any other page. The code I've written below can only scrape the links on the first page.

# -*- coding: utf-8 -*-
import re
from datetime import datetime

import js2xml
import requests
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import FormRequest

# Request headers defined alongside the spider. The spider below does not
# reference them; they are kept for any code that imports this module.
HEADERS = {
    'X-MicrosoftAjax': 'Delta=true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36'
}

URL = 'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'

# Compiled once at import time instead of on every listing page.
_DETAIL_HREF = re.compile(r"InfoDetail")

# Highest positional index read from the detail page's <span> list.
_LAST_SPAN_INDEX = 31


class nnggzySpider(scrapy.Spider):
    """Collect detail-page links from the listing at ``URL`` and print the
    fields extracted from each detail page."""

    name = 'nnggzygov'
    start_urls = [URL]

    # No longer used to build detail URLs (see ``parse``); kept so existing
    # references to ``nnggzySpider.base_url`` continue to work.
    base_url = 'https://www.nnggzy.org.cn'

    custom_settings = {
        'LOG_LEVEL': 'ERROR'
    }

    def parse(self, response):
        """Yield one request per detail link found on the listing page.

        A link qualifies when its ``href`` contains ``InfoDetail``. Each
        request is routed to :meth:`parse_details`.

        NOTE(review): only the page(s) in ``start_urls`` are processed. The
        remaining listing pages are presumably reached through the ASP.NET
        pager postback (``__doPostBack('MoreInfoListZbgs1$Pager', N)``)
        rather than through distinct URLs, so following them would need a
        form POST carrying ``__EVENTTARGET`` / ``__EVENTARGUMENT`` (e.g. via
        ``FormRequest.from_response``) -- confirm against the live site.
        """
        # Not read anywhere in this file; retained in case external code
        # inspects the attribute.
        self.data = {}
        soup = BeautifulSoup(response.body, 'html.parser')
        ##pages = soup.select('font b')[1].text
        for tag in soup.find_all('a', href=_DETAIL_HREF):
            # urljoin resolves both site-absolute and relative hrefs, where
            # plain string concatenation only handled the former.
            detail_url = response.urljoin(tag.get('href'))
            yield scrapy.Request(detail_url, callback=self.parse_details)

    def parse_details(self, response):
        """Extract the project fields from a detail page and print them.

        Prints, in order: project id, agent name, project name, start time
        and budget (digits only). When the page exposes fewer ``<span>``
        elements than the positional lookups need, an error naming the URL
        is logged and nothing is printed for that selector.
        """
        soup = BeautifulSoup(response.body, 'html.parser')
        selectors = ['td#TDContent.infodetail div div.MsoNormal font span']
        for selector in selectors:
            # Run the CSS query once and reuse the result for every field.
            spans = soup.select(selector)
            if len(spans) <= _LAST_SPAN_INDEX:
                self.logger.error(
                    "Expected at least %d spans on %s, found %d; skipping",
                    _LAST_SPAN_INDEX + 1, response.url, len(spans),
                )
                continue
            # NOTE(review): the indices and slice offsets presumably follow
            # the fixed layout and label prefixes of the site's detail
            # template -- verify against a live page.
            agent_name = spans[31].text[13:]
            project_name = spans[3].text
            start_time = spans[20].text[9:]
            # Raw string: "\D" in a normal literal is an invalid escape.
            budget = re.sub(r"\D", "", spans[4].text)
            project_id = spans[2].text[1:]
            print(project_id, agent_name, project_name, start_time, budget)

Can anyone help me with this?

https://stackoverflow.com/questions/66575494/scrapy-crawling-through-pages-with-postback-data-javascript-url-doesnt-change March 11, 2021 at 10:06AM

有些事如何做

2021年3月10日星期三

Scrapy crawling through pages with PostBack data javascript url doesn't change

没有评论:

发表评论