Wednesday, March 10, 2021

Guide to using a for loop in a Python web scraping class

I have a Python bot for email extraction. It takes one query, runs a Google search with that single query, and returns a list of the emails it finds. I need to treat each query's result as one unit, loop over multiple queries with a for loop, and insert each query's list of found emails as a string into its own cell. Any idea how this can be done? This is the scraping code:

import scrapy
from scrapy.spiders import CrawlSpider, Request
from googlesearch import search
import re
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


# create class to extract email ids
class email_extractor(CrawlSpider):

    # adjusting parameters
    name = 'email_ex'

    def __init__(self, *args, **kwargs):
        super(email_extractor, self).__init__(*args, **kwargs)
        self.email_list = []
        self.query = " 'Students with infectious disease, Rome, Italy' "

    def start_requests(self):
        for results in search(self.query, num=10, stop=10, pause=2):
            yield SeleniumRequest(
                url=results,
                callback=self.parse,
                wait_until=EC.presence_of_element_located(
                    (By.TAG_NAME, "html")),
                dont_filter=True
            )

    # extracting emails
    def parse(self, response):
        EMAIL_REGEX = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
        emails = re.finditer(EMAIL_REGEX, str(response.text))
        for email in emails:
            self.email_list.append(email.group())

        for email in set(self.email_list):
            yield {
                "emails": email
            }

        self.email_list.clear()

Currently this works for a single query; I want to loop over multiple queries. Any ideas, please?
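Here is a minimal sketch of one way to do it, assuming the same googlesearch/scrapy_selenium stack and Scrapy >= 1.7 (for cb_kwargs); the queries list and the emails_by_query.csv output file are illustrative placeholders, not part of the original code. Each request is tagged with the query that produced it, parse() files the emails into a per-query bucket, and closed() writes one CSV row per query with that query's emails joined into a single string (one "cell"):

import csv
import re

from scrapy.spiders import CrawlSpider
from googlesearch import search
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

EMAIL_REGEX = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'


class email_extractor(CrawlSpider):
    name = 'email_ex'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # illustrative queries -- replace with your own list
        self.queries = [
            'Students with infectious disease, Rome, Italy',
            'Students with infectious disease, Milan, Italy',
        ]
        # one bucket per query, so each query's emails stay together
        self.emails_by_query = {query: set() for query in self.queries}

    def start_requests(self):
        for query in self.queries:
            for result_url in search(query, num=10, stop=10, pause=2):
                yield SeleniumRequest(
                    url=result_url,
                    callback=self.parse,
                    # tag the request with its query so parse() knows
                    # which bucket the extracted emails belong to
                    cb_kwargs={'query': query},
                    wait_until=EC.presence_of_element_located(
                        (By.TAG_NAME, 'html')),
                    dont_filter=True,
                )

    def parse(self, response, query):
        for match in re.finditer(EMAIL_REGEX, response.text):
            self.emails_by_query[query].add(match.group())

    def closed(self, reason):
        # called once when the spider finishes: write one row per
        # query, with all of its emails joined into a single string
        with open('emails_by_query.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['query', 'emails'])
            for query, emails in self.emails_by_query.items():
                writer.writerow([query, ', '.join(sorted(emails))])

The per-query dict keeps results grouped even though Scrapy fetches the pages concurrently. If you would rather let Scrapy's feed exports handle the output file, yield one item per page from parse() with the query attached and do the grouping in an item pipeline instead.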

https://stackoverflow.com/questions/66574927/guide-using-for-loop-on-python-web-scrapping-class March 11, 2021 at 08:41AM
