I am new to the world of scraping and Python. I have this code:
# Scrape one Amazon.es search-results page and collect name, price, rating and
# opinion text for each listed product into a pandas DataFrame.
import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Alternative search URL (kept for comparison; the author reports it always
# returned the same result):
#page = requests.get('https://www.amazon.es/s?i=computers&bbn=937925031&rh=n%3A937925031%2Cp_36%3A6000-45000%2Cp_n_feature_two_browse-bin%3A12783798031%7C12783804031%2Cp_72%3A831280031%2Cp_89%3AAMD%2Cp_n_condition-type%3A15144009031&s=relevancerank&dc&hidden-keywords=AMD&pf_rd_i=937912031&pf_rd_m=A1AT7YVPFBWXBL&pf_rd_p=df2457c3-6814-4d25-b8d4-2c8634f3b639%2Cdf2457c3-6814-4d25-b8d4-2c8634f3b639&pf_rd_r=5QVCN0DS2Q82FPH17RH1%2C5QVCN0DS2Q82FPH17RH1&pf_rd_s=merchandised-search-leftnav&pf_rd_t=101&qid=1609806932&rnid=15144007031&ref=sr_nr_p_n_condition-type_1',headers={"User-Agent":"Defined"})
page = requests.get(
    'https://www.amazon.es/s?i=computers&bbn=937924031&rh=n%3A937924031%2Cp_n_feature_two_browse-bin%3A12783798031%7C12783804031%2Cp_72%3A831280031%2Cp_n_feature_keywords_two_browse-bin%3A4700044031%2Cp_n_condition-type%3A15144009031%2Cp_n_availability%3A831279031&dc&qid=1609807162&rnid=831270031&ref=sr_nr_p_n_availability_2',
    headers={"User-Agent": "Defined"},
    timeout=30,  # never wait forever on an unresponsive server
)
print(page.status_code)

soup = BeautifulSoup(page.content, 'html.parser')


def _text_or_none(item, css_class):
    """Return the text of the first descendant of *item* matching *css_class*.

    Returns None when no such element exists, so a missing field becomes an
    empty cell instead of raising AttributeError.
    """
    node = item.find(class_=css_class)
    return node.get_text() if node is not None else None


# Container that holds the result cards. find() returns None when the class
# string is not present in the response.
# NOTE(review): presumably that happens when the site serves a different
# layout or a bot-check page -- confirm by inspecting page.content on a
# failing run.
product = soup.find(class_='s-main-slot s-result-list s-search-results sg-row')
if product is None:
    print('Result container not found in the response; no products parsed.')
    items = []
else:
    items = product.find_all(class_='a-section a-spacing-medium')

# Debug aid: show the raw markup of the first card, if there is one.
if items:
    print(items[0])

product_name = [
    _text_or_none(item, 'a-size-medium a-color-base a-text-normal')
    for item in items
]
price = [_text_or_none(item, 'a-price-whole') for item in items]
# availability = [item.find(class_ = 'c-product-card__availability disponibilidad-inmediata cy-product-availability-date').get_text() for item in items]
rating = [
    _text_or_none(
        item, 'a-icon a-icon-star-small a-star-small-4-5 aok-align-bottom'
    )
    for item in items
]
opinion = [_text_or_none(item, 'a-size-base') for item in items]

store = 'Amazon'
extraction_date = datetime.datetime.now()

# 'store' and 'date_extraction' are scalars; pandas broadcasts them to every
# row produced by the list-valued columns.
df = pd.DataFrame(
    {
        'product_name': product_name,
        'price': price,
        # 'availability': availability,
        'rating': rating,
        'opinion': opinion,
        'store': store,
        'date_extraction': extraction_date,
    }
)

# Timestamped output file name, e.g. Amazon_20210105_090600.csv
path = "C:\\PriceTracking\\amazon\\"
mydate = extraction_date.strftime('%Y%m%d')
mytime = extraction_date.strftime('%H%M%S')
filename = f"{path}{store}_{mydate}_{mytime}.csv"
#df.to_csv(filename)
print(df)

# Using the commented-out page always returned the same result, but with the
# second page it sometimes returns product_name correctly and other times it
# returns None.
I'm sure I'm missing something, but I have no idea where to look for the issue.
By the way, the script only returns 24 of the 85 results. Is there an easy way to extract the remaining products?
Thanks all and regards
https://stackoverflow.com/questions/65571901/strange-result-scraping-a-web-with-python-and-beautifulsoup January 05, 2021 at 09:06AM
没有评论:
发表评论