2021年3月10日星期三

Scrapy crawling through pages with PostBack data javascript url doesn't change

I'm crawling through some directories with ASP.NET programming via Scrapy.

The pages to crawl through are encoded as such:

javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')  

where X is an int between 1 and 180. The problem is that the url remains the same when I clicked next page or any page. I've written down some codes below which can only scrapy each link within the first page.

# -*- coding: utf-8 -*-  import scrapy  from bs4 import BeautifulSoup  import re  from scrapy.http import FormRequest  import js2xml  import requests  from datetime import datetime    HEADERS = {      'X-MicrosoftAjax': 'Delta=true',      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36'  }    URL = 'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'    class nnggzySpider(scrapy.Spider):        name = 'nnggzygov'      start_urls = [          'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'      ]        base_url = 'https://www.nnggzy.org.cn'        custom_settings = {          'LOG_LEVEL': 'ERROR'      }        def parse(self, response):          self.data = {}          soup = BeautifulSoup(response.body, 'html.parser')          tags = soup.find_all('a', href=re.compile(r"InfoDetail"))          ##pages = soup.select('font b')[1].text          for tag in tags:              url = nnggzySpider.base_url + tag.get('href')              yield scrapy.Request(url, callback=self.parse_details)        def parse_details(self, response):          soup = BeautifulSoup(response.body, 'html.parser')          selectors = ['td#TDContent.infodetail div div.MsoNormal font span']          for selector in selectors:              agent_name = soup.select(selector)[31].text[13:]              project_name = soup.select(selector)[3].text              start_time = soup.select(selector)[20].text[9:]              budget = re.sub("\D", "", soup.select(selector)[4].text)              id = soup.select(selector)[2].text[1:]              print(id, agent_name, project_name, start_time, budget)  

Can anyone help me with this?

https://stackoverflow.com/questions/66575494/scrapy-crawling-through-pages-with-postback-data-javascript-url-doesnt-change March 11, 2021 at 10:06AM

没有评论:

发表评论