# The first thing we will do is create a project with the tutorial code.
# Go to File -> New Project and (assuming this file is in a meaningful directory) -> Existing Directory.
# Also install the library rvest by going to Tools -> Install Packages -> and entering rvest as the package name.

# -------- Loading and Saving Webpages -------- #

wikipedia_fr_root <- 'https://fr.wikipedia.org/wiki/'

# Load the fr.wikipedia.org page with a given page_name and
# save it as an html file with that same name.
fetch_and_save_wiki_page <- function(page_name){
  full_url <- paste(wikipedia_fr_root, page_name, sep="")

  # Create the output directory if it does not already exist
  directory <- "scraped_wikipages"
  if (!dir.exists(directory)){
    dir.create(directory)
  }

  # And save the file there
  file_name <- paste(directory, '/', page_name, '.html', sep="")
  tryCatch({
    download.file(full_url, file_name)
    print(paste('saved', file_name))
  }, warning = function(w){
    # print("a warning was issued when opening the website")
    # print(w)
  }, error = function(e){
    print(e)
  })
}

# [TRY IT] Try saving a file by calling this:
# fetch_and_save_wiki_page('chat')

# -------- Parsing Webpages -------- #

# Load rvest.
# (You will need to have installed it first. In RStudio go to Tools -> Install Packages.)
# Documentation for rvest: https://cran.r-project.org/web/packages/rvest/index.html
library(rvest)

# This function opens a saved file and uses rvest to extract the title of the page
get_wiki_page_title <- function(file_name){
  html <- read_html(file_name)
  # The %>% is the magrittr forward-pipe operator (find help in R by typing ?"%>%" in your console)
  html %>% html_nodes("title") %>% html_text()
}

# [TRY IT] Try extracting the title by calling:
# title <- get_wiki_page_title('scraped_wikipages/chat.html')
# print(title)

# Opens a saved Wikipedia page and extracts the infobox
get_wiki_info_box <- function(file_name){
  html <- read_html(file_name)
  html_node(html, "div.infobox_v3")
}

# [TRY IT] Try extracting an infobox by calling:
# info_box_html <- get_wiki_info_box('scraped_wikipages/chat.html')
# html_text(info_box_html)

# -------- Saving Results to a CSV file -------- #

# Loads a saved Wikipedia page and saves all the links to a CSV file
process_wiki_links_to_csv <- function(file_name){
  # Read in the file and parse the html
  html <- read_html(file_name)

  # Find all the links in the document
  all_links <- html_nodes(html, 'a')

  # Get all hrefs
  hrefs <- html_attr(all_links, "href")

  # Get all link texts
  link_texts <- html_text(all_links)

  # Build a data frame (a "table") with info for each link
  df <- data.frame(hrefs, link_texts, stringsAsFactors=FALSE)

  csv_file_name <- paste(file_name, "_links.csv", sep="")
  write.csv(df, csv_file_name)
  print(paste("Saved", csv_file_name))
}

# [TRY IT] Try saving all of the links as a CSV file:
# process_wiki_links_to_csv('scraped_wikipages/chat.html')

# This code runs when we execute the script by clicking the Source button in the top right of this window in RStudio.
# It will fetch a Wikipedia page and then save a CSV with its links.
fetch_and_save_wiki_page('mouton')
process_wiki_links_to_csv('scraped_wikipages/mouton.html')
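
# [GOING FURTHER] A minimal sketch (not part of the original exercise) that combines the two
# steps above to fetch and process several pages in one loop. The article names 'chien' and
# 'cheval' are just assumed examples of fr.wikipedia.org page titles; swap in any pages you like.
# for (page_name in c('chien', 'cheval')){
#   fetch_and_save_wiki_page(page_name)
#   process_wiki_links_to_csv(paste('scraped_wikipages/', page_name, '.html', sep=""))
# }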