# The first thing we will do is create a project with the tutorial code.
# Go to File -> New Project and (assuming this file is in a meaningful directory) -> Existing Directory.
# Also install the library rvest by going to Tools -> Install Packages -> and entering rvest as the package name.

# -------- Loading and Saving Webpages -------- #

wikipedia_fr_root <- 'https://fr.wikipedia.org/wiki/'

# Load the fr.wikipedia.org page with a given page_name and
# save it as an html file with that same name.
fetch_and_save_wiki_page <- function(page_name){
  full_url <- paste(wikipedia_fr_root, page_name, sep="")

  # Create the output directory if it does not already exist
  directory <- "scraped_wikipages"
  if (!dir.exists(directory)){
    dir.create(directory)
  }

  # And save the file there
  file_name <- paste(directory, '/', page_name, '.html', sep="")
  tryCatch({
    download.file(full_url, file_name)
    print(paste('saved', file_name))
  }, warning = function(w){
    # print("a warning was issued when opening the website")
    # print(w)
  }, error = function(e){
    print(e)
  })
}

# [TRY IT] Try saving a file by calling this:
# fetch_and_save_wiki_page('chat')

# -------- Parsing Webpages -------- #

# Load rvest.
# (You will need to have installed it first. In RStudio go to Tools -> Install Packages.)
# Documentation for rvest: https://cran.r-project.org/web/packages/rvest/index.html
library(rvest)

# This function opens a saved file and uses rvest to extract the title of the page
get_wiki_page_title <- function(file_name){
  html <- read_html(file_name)
  # The %>% is the magrittr forward-pipe operator (find help in R by typing ?"%>%" in your console)
  html %>% html_nodes("title") %>% html_text()
}

# [TRY IT] Try extracting the title by calling:
# title <- get_wiki_page_title('scraped_wikipages/chat.html')
# print(title)

# Opens a saved Wikipedia page and extracts the infobox
get_wiki_info_box <- function(file_name){
  html <- read_html(file_name)
  html_node(html, "div.infobox_v3")
}

# [TRY IT] Try extracting an infobox by calling:
# info_box_html <- get_wiki_info_box('scraped_wikipages/chat.html')
# html_text(info_box_html)

# -------- Saving Results to a CSV file -------- #

# Loads a saved Wikipedia page and saves all the links to a CSV file
process_wiki_links_to_csv <- function(file_name){
  # Read in the file and parse the html
  html <- read_html(file_name)

  # Find all the links in the document
  all_links <- html_nodes(html, 'a')

  # Get all hrefs
  hrefs <- html_attr(all_links, "href")

  # Get all link texts
  link_texts <- html_text(all_links)

  # Build a data frame (a "table") with info for each link
  df <- data.frame(hrefs, link_texts, stringsAsFactors=FALSE)

  csv_file_name <- paste(file_name, "_links.csv", sep="")
  write.csv(df, csv_file_name)
  print(paste("Saved", csv_file_name))
}

# [TRY IT] Try saving all of the links as a CSV file:
# process_wiki_links_to_csv('scraped_wikipages/chat.html')

# This code runs when we execute the script by clicking the Source button in the top right of this window in RStudio.
# It will fetch a Wikipedia page and then save a CSV with its links.
fetch_and_save_wiki_page('mouton')
process_wiki_links_to_csv('scraped_wikipages/mouton.html')
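
# [GOING FURTHER] A minimal sketch (not part of the original exercise) that combines the two
# steps above to fetch and process several pages in one loop. The article names 'chien' and
# 'cheval' are just assumed examples of fr.wikipedia.org page titles; swap in any pages you like.
# for (page_name in c('chien', 'cheval')){
#   fetch_and_save_wiki_page(page_name)
#   process_wiki_links_to_csv(paste('scraped_wikipages/', page_name, '.html', sep=""))
# }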