# Visual Analytics 2014-2015 - Tutorial #1
# This file contains examples that will help you complete Assignment 1.
# You can run the entire file from the command line using:
#   python tutorial_scraper_examples.py
# Or you can test individual functions directly by copying and
# pasting them into the Python interpreter.

# -------- Loading and Saving Webpages -------- #

# Import the standard library for loading web pages
import urllib.request as urllib2
import os

wikipedia_fr_root = 'http://fr.wikipedia.org/wiki/'


# Load the fr.wikipedia.org page with a given page_name and
# return the raw page contents (bytes).
def load_wiki_page(page_name):
    full_url = wikipedia_fr_root + page_name
    with urllib2.urlopen(full_url) as url_open:
        return url_open.read()

# [TRY IT] Now try loading a real page:
# load_wiki_page('chat')
# (or one that doesn't exist) by calling this:
# load_wiki_page('not_a_real_page')


# Load the fr.wikipedia.org page with a given page_name and
# return the raw page contents (bytes). Returns None if
# there was a problem loading the page.
def load_wiki_page_check(page_name):
    try:
        loaded_page = load_wiki_page(page_name)
        return loaded_page
    except urllib2.URLError as e:
        # e.reason may be a string or an exception object, so convert it explicitly
        print('Error loading page for "' + page_name + '": ' + str(e.reason))
        return None

# [TRY IT] Now try loading a real page and a fake one by calling this:
# load_wiki_page_check('chat')
# load_wiki_page_check('not_a_real_page')


# Load the fr.wikipedia.org page with a given page_name and
# save it as an html file with that same name.
def fetch_and_save_wiki_page(page_name):
    page_html = load_wiki_page_check(page_name)
    if page_html is not None:
        # Create a directory if there isn't one already
        if not os.path.exists('scraped_wikipages'):
            os.makedirs('scraped_wikipages')
        # And save the file there
        file_name = 'scraped_wikipages/' + page_name + '.html'
        save_file = open(file_name, 'w', encoding='utf-8')
        save_file.write(page_html.decode('utf-8'))
        save_file.close()
        print('saved ' + file_name)

# [TRY IT] Try saving a file by calling this:
# fetch_and_save_wiki_page('chat')
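
# -------- (Extra) Fetching Several Pages -------- #

# A minimal sketch of a small helper for fetching several pages in one go,
# in case you want more than one page for Assignment 1. The function name
# fetch_and_save_wiki_pages and the page names in the [TRY IT] line below
# are only illustrative examples, not part of the assignment. time.sleep()
# adds a short pause so we don't send requests to the server too quickly.
import time

def fetch_and_save_wiki_pages(page_names, delay_seconds=1):
    for page_name in page_names:
        fetch_and_save_wiki_page(page_name)
        time.sleep(delay_seconds)

# [TRY IT] Try fetching a few pages at once by calling:
# fetch_and_save_wiki_pages(['chat', 'chien', 'mouton'])
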
# -------- Parsing Webpages -------- #

# Import Beautiful Soup
# (You will need to have installed it first.
#  See: http://www.crummy.com/software/BeautifulSoup/)
# Documentation for Beautiful Soup is available here:
#   http://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup


# Opens a saved file and uses Beautiful Soup to extract the title of the page.
def get_wiki_page_title(file_name):
    with open(file_name, 'r', encoding='utf-8') as read_file:
        html = read_file.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup.title

# [TRY IT] Try extracting the title by calling:
# title = get_wiki_page_title('scraped_wikipages/chat.html')
# print(title)
# print(title.name)
# print(title.string)


# Opens a saved Wikipedia page and extracts the infobox.
def get_wiki_info_box(file_name):
    with open(file_name, 'r', encoding='utf-8') as read_file:
        html = read_file.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find(class_='infobox_v3')

# [TRY IT] Try extracting an infobox by calling:
# info_box_html = get_wiki_info_box('scraped_wikipages/chat.html')
# print(info_box_html.prettify())


# -------- Saving Results to a CSV file -------- #

# Import Python's CSV writer library
import csv


# Loads a saved Wikipedia page and saves all of its links to a CSV file.
def process_wiki_links_to_csv(file_name):
    # Read in the file
    with open(file_name, 'r', encoding='utf-8') as read_file:
        html = read_file.read()
    # Have Beautiful Soup parse the HTML
    soup = BeautifulSoup(html, 'html.parser')
    # Find all the links in the document
    all_links = soup.find_all('a')
    # Build a data structure (a "dictionary") with info for each link
    link_dictionaries_list = []
    for link in all_links:
        try:
            link_dict = {'href': link['href'], 'text': link.string}
            link_dictionaries_list.append(link_dict)
        except KeyError:
            # Skip anchors that have no href attribute
            continue
    print('found ' + str(len(link_dictionaries_list)) + ' links')
    # Save all the links as rows in a CSV file
    csv_file_name = file_name + '.csv'
    with open(csv_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, ['href', 'text'])
        writer.writeheader()
        for link_dict in link_dictionaries_list:
            writer.writerow(link_dict)
    print('saved ' + csv_file_name)

# [TRY IT] Try saving all of the links as a CSV file:
# process_wiki_links_to_csv('scraped_wikipages/chat.html')


# This code is run if we execute the script directly from the command line.
# It will fetch and then process a Wikipedia page and save a CSV with its links.
if __name__ == "__main__":
    fetch_and_save_wiki_page('mouton')
    process_wiki_links_to_csv('scraped_wikipages/mouton.html')
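
# (Extra) Once a CSV file has been saved, you can read it back with
# csv.DictReader. Below is a minimal, commented-out sketch that assumes the
# 'scraped_wikipages/mouton.html.csv' file produced above; treating hrefs
# that start with '/wiki/' as internal Wikipedia links is only a rough
# heuristic for this tutorial.
#
# with open('scraped_wikipages/mouton.html.csv', newline='', encoding='utf-8') as csvfile:
#     reader = csv.DictReader(csvfile)
#     wiki_links = [row for row in reader if row['href'].startswith('/wiki/')]
#     print('found ' + str(len(wiki_links)) + ' internal wiki links')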