Apologies for the long code. I am basically doing a web crawl and outputting the data to an Excel sheet. That part has been working, but when I try to connect to MySQL (MariaDB) I get a "connection refused" error. I have checked for typos and other mistakes, but when running it under the debugger it errors out on one line. I need a helping hand or a hint. Thank you.
The SQL database settings are shown in the attached photo.
It errors out on the code line below:
engine = create_engine( "mysql+mysqldb://root:abcdefgH1@localhost:3306/stamprallydb}", encoding='utf-8')
I have also tried:
engine = create_engine( "mysql+mysqldb://root:abcdefgH1@localhost:50571/stamprallydb}", encoding='utf-8')
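For reference, a bare connection attempt with the same host, port, and credentials should show whether the server is reachable at all, independent of SQLAlchemy. This is only a minimal sketch; it assumes the settings above are the real ones, and it uses the database name stamprallydb without the trailing brace that appears in the URL:

import pymysql

# Minimal reachability check using the same host/port/credentials as the
# connection string above (sketch only -- adjust if the real settings differ).
try:
    conn = pymysql.connect(host="localhost", port=3306, user="root",
                           password="abcdefgH1", database="stamprallydb",
                           charset="utf8mb4")
    print("Connected, server version:", conn.get_server_info())
    conn.close()
except pymysql.err.OperationalError as exc:
    # A refused connection (WinError 10061) typically surfaces here,
    # wrapped in an OperationalError, if nothing is listening on that port.
    print("Connection failed:", exc)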
It would save time to read from the try: part of the code at the bottom.
Error in console
ConnectionRefusedError: [WinError 10061]

Full code:

# This Python file uses the following encoding: utf-8
# 모듈 가져오기
# pip install selenium
# pip install bs4
# pip install pymysql
# 254 Results took 30Min. 1 POST PER 7 SECONS
import pymysql
from sqlalchemy import create_engine
pymysql.install_as_MySQLdb()
import MySQLdb
import re
import pandas as pd
import numpy as np
import urllib.request
import math
import time
import sys
import os
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver as wd
from Stamprally import StamprallyInfo

# Start Recording Program Loading Time
programStart = time.time()

# =============================================================================================
# prefectureNameList = ["海外", "北海道地方", "北海道", "東北地方", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県", "関東地方", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県", "中部地方", "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県", "愛知県", "三重県", "近畿地方", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県", "中国地方", "鳥取県", "島根県", "岡山県", "広島県", "山口県", "四国地方", "徳島県", "香川県", "愛媛県", "高知県", "九州・沖縄地方", "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県"]
# =============================================================================================
prefectureNameList = ["北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県", "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県", "愛知県", "三重県", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県", "鳥取県", "島根県", "岡山県", "広島県", "山口県", "徳島県", "香川県", "愛媛県", "高知県", "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県"]
# prefectureNameList = ["東京都"]

data = []
timeStampData = []
# timeStampDataAggregate = []
contentAggregator = []
timeStampData.append("프로그램 시작")
timeStampData.append(programStart)
# db = Db()
main_url = 'https://stamprally.org/'
# 스탬프랠리 정보를 담는 리스트
# stamprally_list = []

# 드라이브 로드
# For MAC users
driver = wd.Chrome(executable_path='./chromedriver')

# End Recording Program Loading Time
programEnd = time.time()
timeStampData.append(programEnd - programStart)
# timeStampDataAggregate.append(timeStampData)
timeStamp = pd.DataFrame(np.array([timeStampData]), columns=[
    '설명', 'TimeStamp', '소요기간'])
# tmp_res = [[]]
# tmp_res = [][]
# timeStamp = pd.DataFrame(tmp_res[0].append(timeStampData), columns=[
#     '설명', 'TimeStamp', '소요기간'])
print(timeStamp)
timeStampData.clear()
print(timeStampData)

# Start Recording Selenium Driver Loading
print("셀레니엄 ")
timeStampData.append("셀레니엄 드라이버 로딩")
seleniumStart = time.time()
timeStampData.append(seleniumStart)
driver = wd.Chrome(executable_path='chromedriver.exe')
driver.get(main_url)
seleniumEnd = time.time()
timeStampData.append(seleniumEnd - seleniumStart)
print(timeStampData)
rowAddTimeStampSelenium = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampSelenium, ignore_index=True)
# timeStampDataAggregate.append(timeStampData)
timeStampData.clear()

# wait = WebDriverWait(driver, 10)
# searchCat1Options = wait.until(EC.presence_of_all_elements_located(
#     (By.CSS_SELECTOR, "#header_search_cat1>:is(option.level-1 ,option.level-0)")))
# prefectureValueStorage = driver.find_element_by_xpath(
#     "//*[@id='header_search_cat1']/option[2]").get_attribute('value')
# prefectureValueStorage = [x.get_attribute('value') for x in driver.find_elements_by_xpath(
#     "//select[@name='search_cat1']/option[@class='level-1' or @class='level-0']")]
# prefectureValueStorage = [81]
# =============================================================================================
prefectureValueStorage = [x.get_attribute('value') for x in driver.find_elements_by_xpath(
    "//select[@name='search_cat1']/option[@class='level-1']")]
# =============================================================================================
prefectureNameIterator = -1
print("START OF PREFECTURE " + prefectureNameList[prefectureNameIterator])

# Loop through all the different prefectures
for prefectureValue in prefectureValueStorage:
    print("Prefecture Start Time")
    prefectureStart = time.time()
    prefectureNameIterator += 1
    # Add Prefecture Name to timeStampData
    timeStampData.append(prefectureNameList[prefectureNameIterator])
    timeStampData.append(prefectureStart)
    print(timeStampData)
    driver.get(
        f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0")
    # Calculate How Many Times To Run Page Loop
    imageDownloadCounter = 1
    totalList = driver.find_element_by_css_selector(
        'div.page_navi2.clearfix>p').text  # get_attribute('text')
    totalListNum = totalList.split("件中")
    # Add TotalListNum to the contentAggregator
    # contentAggregator.append(int(totalListNum[0]))
    if int(totalListNum[0]) % 10 != 0:
        pageLoopCount = math.ceil((int(totalListNum[0])/10))
    else:
        pageLoopCount = int(totalListNum[0])/10
    # continue
    currentpage = 0
    while currentpage < pageLoopCount:
        currentpage += 1
        print("Current Page " + str(currentpage))
        # ========================================================================================================================================================
        # Loop through all the Listings within the prefecture page
        driver.get(
            f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0&paged={currentpage}")
        # print("Loading Page %s" % currentpage)
        # ========================================================================================================================================================
        # Add prefectureName to the contentAggregator
        # contentAggregator.append(prefectureNameList[prefectureNameIterator])
        # Gather All List Links
        urlList = []
        currentUrlCounter = 0
        listURLContainer = driver.find_elements_by_css_selector(
            '#post_list2 > li > a')
        # Put all the lists in one Array
        for url in listURLContainer:
            urlList.append(url.get_attribute('href'))
        # Loop through all the links
        for listURL in listURLContainer:
            contentAggregator = []
            # Add TotalListNum to the contentAggregator
            contentAggregator.append(int(totalListNum[0]))
            # Add prefectureName to the contentAggregator
            contentAggregator.append(
                prefectureNameList[prefectureNameIterator])
            # print('article Link: ')
            # print(urlList[currentUrlCounter])
            # Add listLink to the contentAggregator
            contentAggregator.append(urlList[currentUrlCounter])
            # for Each Links in listURLContainer:
            driver.get(urlList[currentUrlCounter])
            currentUrlCounter += 1
            locationTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
                "//*[@id='post_meta_top']/li[1]/a[@class='cat-category']")]
            # print(locationTag)
            # Add locationTag to the contentAggregator
            contentAggregator.append(locationTag)
            eventTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
                "//*[@id='post_meta_top']/li[2]/a[@class='cat-category2']")]
            # Add eventTag to the contentAggregator
            contentAggregator.append(eventTag)
            # Select, Post Date & Remove Non-Text Variable
            availablePeriod = (driver.find_element_by_css_selector(
                'div#post_date')).text.split("( ")
            availablePeriodFormatted = availablePeriod[0].replace("開催期間:", "")
            availableStartDate = availablePeriod[0].split(" ~ ")
            endDate = availableStartDate[1]
            availableStartDateFormatted = availableStartDate[0].replace(
                "開催期間:", "")
            # Select Latest Update Date
            lastUpdatedDate = driver.find_element_by_css_selector(
                'time.entry-date.updated').text
            # print("Available Period:")
            # print(availablePeriodFormatted)
            # Add Available Period to the contentAggregator
            contentAggregator.append(availablePeriodFormatted)
            # print("Available StartDate:")
            # print(availableStartDateFormatted)
            # Add Available StartDate to the contentAggregator
            contentAggregator.append(availableStartDateFormatted)
            # print("End Date: ")
            # print(endDate)
            # Add endDate to the contentAggregator
            contentAggregator.append(endDate)
            # print("Last Updated:")
            # print(lastUpdatedDate[6:])
            # Add lastUpdatedDate to the contentAggregator
            contentAggregator.append(lastUpdatedDate[6:])
            # ========================================================================================================================================================
            # Download Main Post Image
            mainImageUrl = driver.find_element_by_css_selector(
                'img.attachment-post-thumbnail.size-post-thumbnail.wp-post-image').get_attribute('src')
            # Add lastUpdatedDate to the contentAggregator
            contentAggregator.append(mainImageUrl)
            # Save Post Main Title
            # postTitle = driver.find_element_by_xpath(
            #     "//*[@id='post_title']/text()")
            postTitle1 = driver.find_element_by_css_selector(
                'h2#post_title').text.replace("開催終了", "")
            postTitle = postTitle1.replace("ただいま開催中", "")
            # Replace all special characters in a string with empty string
            # Pass the string in search
            # specialCharacter = "!@#$%^&*()-+?_=,<>/"
            # if any(character in specialCharacter for character in postTitle):
            removeSpecialChars = postTitle.translate(
                {ord(c): " " for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"})
            postTitle = removeSpecialChars
            print(postTitle)
            # else:
            #     continue
            # Add Title to the contentAggregator
            contentAggregator.append(postTitle)
            # print("Title: ")
            # print(postTitle)
            # Find out Event validty status
            eventValidity = driver.find_element_by_xpath(
                "//*[@id='post_title']/span").text
            # Add eventValidity to the contentAggregator
            contentAggregator.append(eventValidity)
            # Save Post Main Image
            urllib.request.urlretrieve(mainImageUrl, (str(
                prefectureNameList[prefectureNameIterator])+postTitle+str(imageDownloadCounter) + ".png"))
            imageDownloadCounter += 1
            # Get Inner Website Link
            innerWebSiteButtonURL = driver.find_element_by_css_selector(
                'div.post_content.clearfix > div >a').get_attribute('href')
            # print("inner Website Button URL: " + innerWebSiteButtonURL)
            # Add innerWebSiteURL to the contentAggregator
            contentAggregator.append(innerWebSiteButtonURL)
            # Gather Main Post Text Content
            mainText = driver.find_elements_by_css_selector(
                'div.post_content.clearfix > p')
            mainContentText = []
            # Remove Disclamimer text
            for mainContentDetail in mainText:
                mainContentText.append(mainContentDetail.text)
            mainContextTextCount = len(mainContentText)-1
            # print(mainContentText[:mainContextTextCount])
            # Add Main Post Text Content to the contentAggregator
            contentAggregator.append(mainContentText[:mainContextTextCount])
            # ========================================================================================================================================================
            contentReorder = [1, 0, 10, 11, 5, 6, 7, 8, 13, 3, 4, 9, 12, 2]
            contentAggregator = [contentAggregator[i] for i in contentReorder]
            print("=====================================================================================================================================================")
            # print(contentAggregator)
            data.append(contentAggregator)
            # print(data)
            # print(pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink", "Location Tag", "Event Tag", "Available Period",
            #                                   "Available StartDate", "End Date", "Last Updated", "Main Image URL", "Title", "Event Validty" "innerWebSiteURL", "mainText"]))

    df = pd.DataFrame(data, columns=["Prefecture", "Total List Number", "Title", "Event Validity", "Available Period",
                                     "Available StartDate", "End Date", "Last Updated", "mainText", "Location Tag", "Event Tag", "Main Image URL", "innerWebSiteURL", "ListLink"])
    try:
        engine = create_engine(
            "mysql+mysqldb://root:abcdefgH1@localhost:3306/stamprallydb}", encoding='utf-8')
        conn = engine.connect()
        df.to_sql(name='stamprallydb_crawl_result',
                  con=engine, if_exists='append', index=False)
    except:
        print("Error in Mysql connection")
    else:
        prefectureEnd = time.time()
        timeStampData.append(prefectureEnd-prefectureStart)
        rowAddTimeStampPrefecture = pd.Series(
            timeStampData, index=timeStamp.columns)
        timeStamp = timeStamp.append(
            rowAddTimeStampPrefecture, ignore_index=True)
        timeStampData.clear()
        # timeStampDataAggregate.append(timeStampData)

excelFileStart = time.time()
xlwriter = pd.ExcelWriter('StampRally_Crawler.xlsx')
df.to_excel(xlwriter, sheet_name="Stamprally.org Crawl Result")
# Add last Series "엑셀 파일 저장"
excelFileEnd = time.time()
timeStampData.append("엑셀 파일 저장")
timeStampData.append(excelFileStart)
timeStampData.append(excelFileEnd-excelFileStart)
rowAddTimeStampPrefecture = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampPrefecture, ignore_index=True)
# timeStampDataAggregate.append(timeStampData)
# Create New sheet and write to Excel
timeStamp.to_excel(xlwriter, sheet_name="TimeStamp Result")
xlwriter.close()

# dictionary = {'Prefecture': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
#               'Title': [5.1, 6.2, 5.1, 5.2],
#               'AvailablePeriod': ['Msc', 'MA', 'Msc', 'Msc']
#               'AvailableDate': ['Msc', 'MA', 'Msc', 'Msc']
#               'AvailableEndDate': ['Msc', 'MA', 'Msc', 'Msc']
#               'LastUpdateDate': ['Msc', 'MA', 'Msc', 'Msc']
#               'Maintext': ['Msc', 'MA', 'Msc', 'Msc']
#               'PostLink': ['Msc', 'MA', 'Msc', 'Msc']
#               'InnerWebsiteLink': ['Msc', 'MA', 'Msc', 'Msc']
#               'LocationTag': ['Msc', 'MA', 'Msc', 'Msc']
#               'EventTag': ['Msc', 'MA', 'Msc', 'Msc']
#               }
# ========================================================================================================================================================
# Close Off
driver.close()
driver.quit()
sys.exit()
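To save reading the whole crawler, the database step on its own boils down to something like this. It is only a minimal sketch with a throwaway DataFrame standing in for the crawl result; the engine URL is the same one used in the try: block above, copied exactly as I have it:

import pandas as pd
import pymysql
from sqlalchemy import create_engine

pymysql.install_as_MySQLdb()

# Throwaway DataFrame standing in for the crawl result (sketch only).
df = pd.DataFrame({"Prefecture": ["東京都"], "Title": ["test"]})

# Same engine URL as in the try: block above.
engine = create_engine(
    "mysql+mysqldb://root:abcdefgH1@localhost:3306/stamprallydb}", encoding='utf-8')
conn = engine.connect()
df.to_sql(name='stamprallydb_crawl_result', con=engine,
          if_exists='append', index=False)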
https://stackoverflow.com/questions/67393862/connectionrefusederror-winerror-10061-cannot-transfer-to-sql-from-dataframe May 05, 2021 at 08:48AM