# Visual Analytics 2014-2015 - Tutorial #1
# This file contains examples that will help you complete Assignment 1.
# You can run the entire file from the command line using:
#   python tutorial_scraper_examples.py
# Or you can test individual functions directly by copying and
# pasting them into the Python interpreter.

# -------- Loading and Saving Webpages -------- #

# Import the standard library for loading web pages
import urllib.request as urllib2
import os

wikipedia_fr_root = 'http://fr.wikipedia.org/wiki/'


# Load the fr.wikipedia.org page with a given page_name and
# return the raw page contents (bytes).
def load_wiki_page(page_name):
    full_url = wikipedia_fr_root + page_name
    with urllib2.urlopen(full_url) as url_open:
        return url_open.read()

# [TRY IT] Now try loading a real page:
# load_wiki_page('chat')
# (or one that doesn't exist) by calling this:
# load_wiki_page('not_a_real_page')


# Load the fr.wikipedia.org page with a given page_name and
# return the raw page contents (bytes). Returns None if
# there was a problem loading the page.
def load_wiki_page_check(page_name):
    try:
        loaded_page = load_wiki_page(page_name)
        return loaded_page
    except urllib2.URLError as e:
        # e.reason may be a string or an exception object, so convert it explicitly
        print('Error loading page for "' + page_name + '": ' + str(e.reason))
        return None

# [TRY IT] Now try loading a real page and a fake one by calling this:
# load_wiki_page_check('chat')
# load_wiki_page_check('not_a_real_page')


# Load the fr.wikipedia.org page with a given page_name and
# save it as an html file with that same name.
def fetch_and_save_wiki_page(page_name):
    page_html = load_wiki_page_check(page_name)
    if page_html is not None:
        # Create a directory if there isn't one already
        if not os.path.exists('scraped_wikipages'):
            os.makedirs('scraped_wikipages')
        # And save the file there
        file_name = 'scraped_wikipages/' + page_name + '.html'
        save_file = open(file_name, 'w', encoding='utf-8')
        save_file.write(page_html.decode('utf-8'))
        save_file.close()
        print('saved ' + file_name)

# [TRY IT] Try saving a file by calling this:
# fetch_and_save_wiki_page('chat')
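
# -------- (Extra) Fetching Several Pages -------- #

# A minimal sketch of a small helper for fetching several pages in one go,
# in case you want more than one page for Assignment 1. The function name
# fetch_and_save_wiki_pages and the page names in the [TRY IT] line below
# are only illustrative examples, not part of the assignment. time.sleep()
# adds a short pause so we don't send requests to the server too quickly.
import time

def fetch_and_save_wiki_pages(page_names, delay_seconds=1):
    for page_name in page_names:
        fetch_and_save_wiki_page(page_name)
        time.sleep(delay_seconds)

# [TRY IT] Try fetching a few pages at once by calling:
# fetch_and_save_wiki_pages(['chat', 'chien', 'mouton'])
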
# -------- Parsing Webpages -------- #

# Import Beautiful Soup
# (You will need to have installed it first.
#  See: http://www.crummy.com/software/BeautifulSoup/)
# Documentation for Beautiful Soup is available here:
#   http://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup


# Opens a saved file and uses Beautiful Soup to extract the title of the page.
def get_wiki_page_title(file_name):
    with open(file_name, 'r', encoding='utf-8') as read_file:
        html = read_file.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup.title

# [TRY IT] Try extracting the title by calling:
# title = get_wiki_page_title('scraped_wikipages/chat.html')
# print(title)
# print(title.name)
# print(title.string)


# Opens a saved Wikipedia page and extracts the infobox.
def get_wiki_info_box(file_name):
    with open(file_name, 'r', encoding='utf-8') as read_file:
        html = read_file.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find(class_='infobox_v3')

# [TRY IT] Try extracting an infobox by calling:
# info_box_html = get_wiki_info_box('scraped_wikipages/chat.html')
# print(info_box_html.prettify())


# -------- Saving Results to a CSV file -------- #

# Import Python's CSV writer library
import csv


# Loads a saved Wikipedia page and saves all of its links to a CSV file.
def process_wiki_links_to_csv(file_name):
    # Read in the file
    with open(file_name, 'r', encoding='utf-8') as read_file:
        html = read_file.read()
    # Have Beautiful Soup parse the HTML
    soup = BeautifulSoup(html, 'html.parser')
    # Find all the links in the document
    all_links = soup.find_all('a')
    # Build a data structure (a "dictionary") with info for each link
    link_dictionaries_list = []
    for link in all_links:
        try:
            link_dict = {'href': link['href'], 'text': link.string}
            link_dictionaries_list.append(link_dict)
        except KeyError:
            # Skip anchors that have no href attribute
            continue
    print('found ' + str(len(link_dictionaries_list)) + ' links')
    # Save all the links as rows in a CSV file
    csv_file_name = file_name + '.csv'
    with open(csv_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, ['href', 'text'])
        writer.writeheader()
        for link_dict in link_dictionaries_list:
            writer.writerow(link_dict)
    print('saved ' + csv_file_name)

# [TRY IT] Try saving all of the links as a CSV file:
# process_wiki_links_to_csv('scraped_wikipages/chat.html')


# This code is run if we execute the script directly from the command line.
# It will fetch and then process a Wikipedia page and save a CSV with its links.
if __name__ == "__main__":
    fetch_and_save_wiki_page('mouton')
    process_wiki_links_to_csv('scraped_wikipages/mouton.html')
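
# (Extra) Once a CSV file has been saved, you can read it back with
# csv.DictReader. Below is a minimal, commented-out sketch that assumes the
# 'scraped_wikipages/mouton.html.csv' file produced above; treating hrefs
# that start with '/wiki/' as internal Wikipedia links is only a rough
# heuristic for this tutorial.
#
# with open('scraped_wikipages/mouton.html.csv', newline='', encoding='utf-8') as csvfile:
#     reader = csv.DictReader(csvfile)
#     wiki_links = [row for row in reader if row['href'].startswith('/wiki/')]
#     print('found ' + str(len(wiki_links)) + ' internal wiki links')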