I have Python code that extracts data from a website and writes it to a CSV file. The code works fine, but now I would like to iterate over a list of web pages with the same structure to collect more data.
My code is:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime

url_list = [
    'https://www.pccomponentes.com/procesadores/socket-am4',
    'https://www.pccomponentes.com/discos-duros/500-gb/conexiones-m-2/disco-ssd/internos',
    'https://www.pccomponentes.com/discos-duros/1-tb/conexiones-m-2/disco-ssd/internos',
    'https://www.pccomponentes.com/placas-base/amd-b550/atx',
    'https://www.pccomponentes.com/placas-base/amd-x570/atx',
    'https://www.pccomponentes.com/memorias-ram/16-gb/kit-2x8gb',
    'https://www.pccomponentes.com/ventiladores-cpu',
    'https://www.pccomponentes.com/fuentes-alimentacion/850w/fuente-modular',
    'https://www.pccomponentes.com/fuentes-alimentacion/750w/fuente-modular',
    'https://www.pccomponentes.com/cajas-pc/atx/con-ventana/sin-ventana',
]

for link in url_list:
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Locate the product listing and pull out one entry per product card.
    product = soup.find(id='articleListContent')
    items = product.find_all(class_='c-product-card__content')

    product_name = [item.find(class_='c-product-card__header').get_text() for item in items]
    price = [item.find(class_='c-product-card__prices cy-product-price').get_text() for item in items]
    # availability = [item.find(class_='c-product-card__availability disponibilidad-inmediata cy-product-availability-date').get_text() for item in items]
    rating = [item.find(class_='c-star-rating__text cy-product-text').get_text() for item in items]
    opinion = [item.find(class_='c-star-rating__text cy-product-rating-result').get_text() for item in items]

    store = 'PCComponentes'
    extraction_date = datetime.datetime.now()

    data_PCCOMP = pd.DataFrame({
        'product_name': product_name,
        'price': price,
        # 'availability': availability,
        'rating': rating,
        'opinion': opinion,
        'store': store,
        'date_extraction': extraction_date,
    })

    # This currently writes a separate CSV file for every URL.
    path = 'C:\\PriceTracking\\pccomp\\'
    mydate = extraction_date.strftime('%Y%m%d')
    mytime = extraction_date.strftime('%H%M%S')
    filename = path + store + '_' + mydate + '_' + mytime + '.csv'
    data_PCCOMP.to_csv(filename)
How can I iterate so that the data from all of the URLs ends up in the same CSV file?
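One idea I had is to keep building the DataFrame for each URL inside the loop as before, collect the frames in a list, and only write the CSV once after the loop by concatenating them. Here is a rough, untested sketch of what I mean (frames and all_data are just placeholder names I made up):

frames = []  # accumulate one DataFrame per scraped page

for link in url_list:
    # ... the same scraping code as above, which builds data_PCCOMP
    # for the current page ...
    frames.append(data_PCCOMP)

# Combine every page into one table and write the CSV a single time.
all_data = pd.concat(frames, ignore_index=True)

extraction_date = datetime.datetime.now()
filename = ('C:\\PriceTracking\\pccomp\\PCComponentes_'
            + extraction_date.strftime('%Y%m%d') + '_'
            + extraction_date.strftime('%H%M%S') + '.csv')
all_data.to_csv(filename, index=False)

Is moving the to_csv() call out of the loop like this the right approach?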
Any help would be much appreciated.