I have a python bot for email extraction which takes one query and google search using this single query and then returns a list of found emails using the query, I need to consider the result as one, and use for loop for multiple queries and insert the list of found emails as a string into each cell. Please any idea how this can be done. This is the scrapping code
import scrapy from scrapy.spiders import CrawlSpider, Request from googlesearch import search import re from scrapy_selenium import SeleniumRequest from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC # create class to extract email ids class email_extractor(CrawlSpider): # adjusting parameters name = 'email_ex' def __init__(self, *args, **kwargs): super(email_extractor, self).__init__(*args, **kwargs) self.email_list = [] **self.query = " 'Students with infectious disease, Rome, Italy' "** def start_requests(self): for results in search(self.query, num=10, stop=10, pause=2): yield SeleniumRequest( url=results, callback=self.parse, wait_until=EC.presence_of_element_located( (By.TAG_NAME, "html")), dont_filter=True ) # extracting emails def parse(self, response): EMAIL_REGEX = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+' emails = re.finditer(EMAIL_REGEX, str(response.text)) for email in emails: self.email_list.append(email.group()) for email in set(self.email_list): yield{ "emails": email } self.email_list.clear() Currently, this work for a single query, want to loop over multiple queries, any idea please?
https://stackoverflow.com/questions/66574927/guide-using-for-loop-on-python-web-scrapping-class March 11, 2021 at 08:41AM
没有评论:
发表评论