`library(rvest) library(RCurl) library(XML) library(stringr) #Getting the number of Page getPageNumber <- function(URL){ parsedDocument = read_html(URL) Sort1 <- html_nodes(parsedDocument, 'div') Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")] P <- str_count(html_text(Sort2), pattern = " \\d+\r\n") return(ifelse(length(P) == 0, 0, max(P))) } #Getting all articles based off of their DOI getAllArticles <-function(URL){ parsedDocument = read_html(URL) Sort1 <- html_nodes(parsedDocument,'div') Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")] ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(Sort2))) URL3 <- "https://doi.org/10.1093/dnares/" URL4 <- paste(URL3, ArticleDOInumber, sep = "") return(URL4) } Title <- function(parsedDocument){ Sort1 <- html_nodes(parsedDocument, 'h4') Title <- gsub("<a>\\n|\\n</a>","",Sort1) return(Title) } #main function with input as parameter year findURL <- function(year_chosen){ if(year_chosen >= 1994){ noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}") pagesURl = "&fl_SiteID=5275&page=" URL = paste(noYearURL, pagesURl, sep = "") #URL is working with parameter year_chosen Page <- getPageNumber(URL) if(Page == 5){ Page2 <- 0 while(Page < Page2 | Page != Page2){ Page <- Page2 URL3 <- paste(URL, Page-1, sep = "") Page2 <- getPageNumber(URL3) } } R_Data <- data.frame() for(i in 0:ifelse((Page-1) > 0, (Page-1), 0)){ URL2 <- getAllArticles(paste(URL, i, sep = "")) for(j in 1:(length(URL2))){ parsedDocument <- read_html(URL2[j]) print(URL2[j]) R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE) R_Data <- rbind(R_Data, R) } } write.csv(R_Data, "Group4.csv", row.names = FALSE, sep = "\t") } else { print("The Year you provide is out of range, this journal only contain articles from 2005 to present") } } findURL(2000)` So I am Trying to scrape a website for a given year and inside my main function I try to loop through different pages. Extracting just the title of each article.
I keep getting this error -> Error in open.connection(x, "rb") : HTTP error 404
Some years have only 3 pages so I can see why there may be an error for that, but mostly all have articles have at 5 pages of journals.
After scraping the journals by year I want to write out the scraped titles onto a civ file .
Thank you in advance for the help!
https://stackoverflow.com/questions/66756094/keep-getting-error-in-r-error-in-open-connectionx-rb-http-error-404 March 23, 2021 at 09:40AM
没有评论:
发表评论