Data Wrangling with MongoDB - Exercises

Q1.1

"""
Your task is to process the supplied file and use the csv module to extract data from it.
The data comes from NREL (National Renewable Energy Laboratory) website. Each file
contains information from one meteorological station, in particular - about amount of
solar and wind energy for each hour of day.

Note that the first line of the datafile is neither data entry, nor header. It is a line
describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries).
You can use the csv modules "reader" method to get data in such format.
Another useful method is next() - to get the next line from the iterator.
You should only change the parse_file function.
"""
import csv
import os

DATADIR = ""
DATAFILE = "data/745090.csv"


def parse_file(datafile):
    name = ""	
    data = []
    with open(datafile,'rb') as f:
        csvFile = csv.reader(f, delimiter=",", quotechar='"')
        descriptionRow = csvFile.next()  # Description Row that contains the name of the city for this csv file.
        name = descriptionRow[1]  # Pull out name from description Row
        csvFile.next()  # Skip over Header Row
        for row in csvFile:  # Read the rest of the csv data sheet into a list, with each row as a separate list.
            data.append(row)
    return (name, data)


def test():
    datafile = os.path.join(DATADIR, DATAFILE)
    name, data = parse_file(datafile)
    
    
    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert data[0][1] == "01:00"
    assert data[2][0] == "01/01/2005"
    assert data[2][5] == "2"
    print "Passed."
    

test()

Passed.

Q 1.2

# Find the time and value of max load for each of the regions
# COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
# and write the result out in a csv file, using pipe character | as the delimiter.
# An example output can be seen in the "example.csv" file.
import xlrd
#import os
#import csv
from zipfile import ZipFile
datafile = "data/2013_ERCOT_Hourly_Load_Data.xls"
outfile = "data/2013_Max_Loads.csv"


def open_zip(datafile):
    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
        myzip.extractall(path="data")


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = []
    headers = sheet.row_values(0, start_colx=0, end_colx=None)

    writer_header = ["Station", "Year", "Month", "Day", "Hour", "Max Load"]
    data.append(writer_header)


    for i in range(1, len(headers)):
        temp_sheet_column = sheet.col_values(i, start_rowx=1, end_rowx=None)
        temp_max = max(temp_sheet_column)
        temp_max_index = temp_sheet_column.index(temp_max) + 1
        raw_max_date = sheet.cell_value(temp_max_index, 0)
        max_date_values = xlrd.xldate_as_tuple(raw_max_date, 0)


        d = [headers[i], max_date_values[0], max_date_values[1], max_date_values[2], max_date_values[3], temp_max]

        data.append(d)

    return data

def save_file(data, filename):
    with open(filename, 'wb') as f:
        writer = csv.writer(f, delimiter='|')
        writer.writerows(data)

    
def test():
    open_zip(datafile)
    data = parse_file(datafile)
    save_file(data, outfile)

    ans = {'FAR_WEST': {'Max Load': "2281.2722140000024", 'Year': "2013", "Month": "6", "Day": "26", "Hour": "17"}}
    
    fields = ["Year", "Month", "Day", "Hour", "Max Load"]
    with open(outfile) as of:
        csvfile = csv.DictReader(of, delimiter="|")
        for line in csvfile:
            s = line["Station"]
            if s == 'FAR_WEST':
                for field in fields:
                    assert ans[s][field] == line[field]
    print "Passed."

        
test()

Passed.

Q 1.3

"""
This exercise shows some important concepts that you should be aware about:
- using codecs module to write unicode files
- using authentication with web APIs
- using offset when accessing web APIs

To run this code locally you have to register at the NYTimes developer site 
and get your own API key. You will be able to complete this exercise in our UI without doing so,
as we have provided a sample result.

Your task is to process the saved file that represents the most popular (by view count)
articles in the last day, and return the following data:
- list of dictionaries, where the dictionary key is "section" and value is "title"
- list of URLs for all media entries with "format": "Standard Thumbnail"

All your changes should be in the article_overview function.
The rest of functions are provided for your convenience, if you want to access the API by yourself.
"""
import json
import codecs
import requests

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = { "popular": "",
            "article": ""}


def get_from_file(kind, period):
    filename = "data/popular-{0}-{1}.json".format(kind, period)
    with open(filename, "r") as f:
        return json.loads(f.read())


def article_overview(kind, period):
    data = get_from_file(kind, period)
    urls = []
    titles = []
    for asset_id in data:
        try:
              # Add article section:title as a dictionary to the list of the titles.
            titles.append({asset_id["section"]: asset_id["title"]})
              # Json python objects are nested dictionaries.
              # Each article entry has 0+ media record entries.
            for media_record in asset_id["media"]:
                  # Each media record has 0+ metadata entries for the media record
                for meta_data_record in media_record["media-metadata"]:
                      # If the format is what we are looking for (Standard Thumbnail)..
                    if meta_data_record["format"] == "Standard Thumbnail":
                          # We grab the url for that format that passed the check.
                        urls.append(meta_data_record["url"])
        except IndexError as err:
            print "ERROR:", err
    return titles, urls


def query_site(url, target, offset):
    # This will set up the query with the API key and offset
    # Web services often use offset paramter to return data in small chunks
    # NYTimes returns 20 articles per request, if you want the next 20
    # You have to provide the offset parameter
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print "You need to register for NYTimes Developer account to run this program."
        print "See Intructor notes for information"
        return False
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params = params)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def get_popular(url, kind, days, section="all-sections", offset=0):
    # This function will construct the query according to the requirements of the site
    # and return the data, or print an error message if called incorrectly
    if days not in [1,7,30]:
        print "Time period can be 1,7, 30 days only"
        return False
    if kind not in ["viewed", "shared", "emailed"]:
        print "kind can be only one of viewed/shared/emailed"
        return False

    url = URL_POPULAR + "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)

    return data


def save_file(kind, period):
    # This will process all results, by calling the API repeatedly with supplied offset value,
    # combine the data and then write all results in a file.
    data = get_popular(URL_POPULAR, "viewed", 1)
    num_results = data["num_results"]
    full_data = []
    with codecs.open("popular-{0}-{1}-full.json".format(kind, period), encoding='utf-8', mode='w') as v:
        for offset in range(0, num_results, 20):        
            data = get_popular(URL_POPULAR, kind, period, offset=offset)
            full_data += data["results"]
        
        v.write(json.dumps(full_data, indent=2))


def test():
    titles, urls = article_overview("viewed", 1)
    assert len(titles) == 20
    assert len(urls) == 30
    assert titles[2] == {'Opinion': 'Professors, We Need You!'}
    assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'
    print "Passed."
    
test()

Passed.

Lesson 2 Exercise

#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys
import xml.etree.ElementTree as ET

article_file = "data/exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    #authors = []
    #for author in root.findall('./fm/bibl/aug/au'):
    #
    #      # Find the appropriate object tag listed under authors.
    #      # Call that objects' .text function to extract the text value.
    #    data = {
    #            "fnm": author.find("fnm").text,
    #            "snm": author.find("snm").text,
    #            "email": author.find("email").text
    #    }
    #
    #
    #   authors.append(data)
    authors = [{"fnm": author.find("./fnm").text, 
                "snm": author.find("./snm").text,
                "email": author.find("./email").text} for author in root.findall('./fm/bibl/aug/au')] 

    return authors


def test():
    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}, {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}, {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}, {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}, {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}, {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}, {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}, {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]
    
    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["fnm"] == solution[1]["fnm"]
    print "Passed."

test()

Passed.

#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys, but you have to extract the attributes from the "insr" tag
# and add them to the list for the dictionary key "insr"
import xml.etree.ElementTree as ET

article_file = "data/exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()



def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):

          # Find the appropriate object tag listed under authors.
          # Call that objects' .text function to extract the text value.

        all_iid = [insr.get('iid') for insr in author.findall('./insr')]

        data = {
                "fnm": author.find("./fnm").text,
                "snm": author.find("./snm").text,
                "email": author.find("./email").text,
                "insr": all_iid
        }


        authors.append(data)

    return authors


def test():
    solution = [{'insr': ['I1'], 'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'insr': ['I2'], 'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'insr': ['I3', 'I4'], 'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'insr': ['I3'], 'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'insr': ['I8'], 'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'insr': ['I3', 'I5'], 'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'insr': ['I6'], 'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'insr': ['I7'], 'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["insr"] == solution[1]["insr"]
    print "Passed."


test()

Passed.

Using Beautiful Soup

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the approprate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

html_page = "data/page_source.html"


def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        bs = BeautifulSoup(html)
        event_list = bs.find(id='__EVENTVALIDATION')
        event_view = bs.find(id='__VIEWSTATE')
        data["eventvalidation"] = event_list['value']
        data["viewstate"] = event_view['value']

    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


def test():
    data = extract_data(html_page)
    assert data["eventvalidation"] != ""
    assert data["eventvalidation"].startswith("/wEWjAkCoIj1ng0")
    assert data["viewstate"].startswith("/wEPDwUKLTI")
    print "Passed."

    
test()

Passed.

Q 2.1

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to to actually use it from within the Udacity web UI
# All your changes should be in the 'extract_carrier' function
# Also note that the html file is a stripped down version of what is actually on the website.

# Your task in this exercise is to get a list of all airlines. Exclude all of the combination
# values, like "All U.S. Carriers" from the data that you return.
# You should return a list of codes for the carriers.

from bs4 import BeautifulSoup
html_page = "data/options.html"


def extract_carriers(page):
    with open(page, "r") as html:
        soup = BeautifulSoup(html)
        
        data = [options['value'] for options in soup.find(id='CarrierList') if options.name == 'option' and options['value'][:3] != 'All']
        
#        for options in soup.find(id='CarrierList'):
#            try:
#                if options.name == "option" and options['value'][:3] != "All":
#                    data.append(options['value'])
#            except TypeError as err:
#                print "ERROR:", err
#                pass

    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': airport,
                          'CarrierList': carrier,
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data
    print "Passed."

test()

Passed.

Q2.2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# All your changes should be in the 'extract_airports' function
# It should return a list of airport codes, excluding any combinations like "All"

from bs4 import BeautifulSoup
html_page = "data/options.html"



def extract_airports(page):
    with open(page, "r") as html:
        soup = BeautifulSoup(html)
        data = [airport['value'] for airport in soup.find(id='AirportList') if airport.name=='option' and airport['value'][:3] != 'All']

    return data

def test():
    data = extract_airports(html_page)
    assert len(data) == 15
    assert "ATL" in data
    assert "ABR" in data
    print "Passed."

test()

Passed.

Q2.3

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Let's assume that you combined the code from the previous 2 exercises
# with code from the lesson on how to build requests, and downloaded all the data locally.
# The files are in a directory "data", named after the carrier and airport:
# "{}-{}.html".format(carrier, airport), for example "FL-ATL.html".
# The table with flight info has a table class="dataTDRight".
# There are couple of helper functions to deal with the data files.
# Please do not change them for grading purposes.
# All your changes should be in the 'process_file' function

from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

#datadir = "data"


#def open_zip(datadir):
#    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
#        myzip.extractall()


#def process_all(datadir):
#    files = os.listdir(datadir)
#    return files


def process_file(f):
    # This is example of the datastructure you should return
    # Each item in the list should be a dictionary containing all the relevant data
    # Note - year, month, and the flight data should be integers
    # You should skip the rows that contain the TOTAL data for a year
    # data = [{"courier": "FL",
    #         "airport": "ATL",
    #         "year": 2012,
    #         "month": 12,
    #         "flights": {"domestic": 100,
    #                     "international": 100}
    #         },
    #         {"courier": "..."}
    # ]

    # info = {}
    #
    #
    # info["courier"], info["airport"] = f[:6].split("-")

    #print info
    
    data = []
    with open("{}/{}".format(datadir, f), "r") as html:

        soup = BeautifulSoup(html)

        flight_table = soup.find("table", {"class": "dataTDRight"})

        for flight_data_row in flight_table.findAll('tr'):

            info = {}
            info["courier"], info["airport"] = f[:6].split("-")

            col = flight_data_row.findAll("td")
            year = col[0].string.strip()
            month = col[1].string.strip()
            domestic = col[2].string.strip()
            international = col[3].string.strip()
            skip_total = col[4].string.strip()

            try:

                year = int(str(year))
                month = int(str(month))
                domestic = int(str(domestic).replace(",", ""))
                international = int(str(international).replace(",", ""))

                info["year"] = year
                info["month"] = month
                info["flights"] = {"domestic": domestic,
                                   "international": international}
                data.append(info)
            except ValueError as err:
                #print "Tried converting a non-int type to int:", err
                pass
    return data

# Udacity accesses many external files; not available locally. 
# Passes grader checks with lines uncommented back in.
def test():
    print "Running a simple test..."
    #open_zip(datadir)
    #files = process_all(datadir)
    #data = []
    #for f in files:
    #    data += process_file(f)
    data = process_file('FL-ATL.html')
    
    #assert len(data) == 399
    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[-1]["airport"] == "ATL"
    #assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
    
    print "... success!"

if __name__ == "__main__":
    test()

Running a simple test...
... success!

Q2.4

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This and the following exercise are using US Patent database.
# The patent.data file is a small excerpt of a much larger datafile
# that is available for download from US Patent website. They are pretty large ( >100 MB each).
# The data itself is in XML, however there is a problem with how it's formatted.
# Please run this script and observe the error. Then find the line that is causing the error.
# You can do that by just looking at the datafile in the web UI, or programmatically.
# For quiz purposes it does not matter, but as an exercise we suggest that you try to do it programmatically.
# The original file is ~600MB large, you might not be able to open it in a text editor.

import xml.etree.ElementTree as ET

PATENTS = 'data/patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

get_root(PATENTS)

  File "<string>", line unknown
ParseError: junk after document element: line 657, column 0

import linecache

print linecache.getline('data/patent.data', 657)

<?xml version="1.0" encoding="UTF-8"?>

Q2.5a Please enter the content of the line that is causing the error:

Q2.5b What do you think is the problem?

Q2.6

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not a valid XML, because
# it has several root elements, and XML declarations.
# It is, a matter of fact, a collection of a lot of concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.

import xml.etree.ElementTree as ET
PATENTS = 'data/patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def split_file(filename):
    # we want you to split the input file into separate files
    # each containing a single patent.
    # As a hint - each patent declaration starts with the same line that was causing the error
    # The new files should be saved with filename in the following format:
    # "{}-{}".format(filename, n) where n is a counter, starting from 0.

    output = []
    data = {}

    f = open(filename)
    count = 0
    file_number = 0

    # import pprint
    # pprint.pprint(f.readlines())


    output.append(f.readline())

    for line in f.readlines():

        if line.startswith("<?xml"):
            data["patent.data-{}".format(file_number)] = output



            root = ET.fromstringlist(output)
            # print ""
            # print root.tag
            # print root.attrib
            #
            # for child in root:
            #     print(child.tag, child.attrib)

            tree = ET.ElementTree(root)
            tree.write("data/patent.data-{}".format(file_number), encoding = 'UTF-8')
            output = []
            file_number += 1
        output.append(line)


    data["patent.data-{}".format(file_number)] = output
    root = ET.fromstringlist(output)
    tree = ET.ElementTree(root)
    tree.write("data/patent.data-{}".format(file_number), encoding = 'UTF-8')

    #import pprint
    #pprint.pprint(data)
    # return data
    pass


def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print "You have not split the file {} in the correct boundary!".format(fname)
            f.close()
        except:
            print "Could not find file {}. Check if the filename is correct!".format(fname)
    print "Passed."


test()

Passed.

Lesson 3 - Correcting Validity

"""
Your task is to check the "productionStartYear" of the DBPedia autos datafile for valid values.
The following things should be done:
- check if the field "productionStartYear" contains a year
- check if the year is in range 1886-2014
- convert the value of the field to be just a year (not full datetime)
- the rest of the fields and values should stay the same
- if the value of the field is a valid year in range, as described above,
  write that line to the output_good file
- if the value of the field is not a valid year, 
  write that line to the output_bad file
- discard rows (neither write to good nor bad) if the URI is not from dbpedia.org
- you should use the provided way of reading and writing data (DictReader and DictWriter)
  They will take care of dealing with the header.

You can write helper functions for checking the data and writing the files, but we will call only the 
'process_file' with 3 arguments (inputfile, output_good, output_bad).
"""
import csv
import pprint

INPUT_FILE = 'data/autos.csv'
OUTPUT_GOOD = 'data/autos-valid.csv'
OUTPUT_BAD = 'data/FIXME-autos.csv'

def process_file(input_file, output_good, output_bad):

    with open(input_file, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames

        good = []
        bad = []

        for row in reader:
            if row["URI"].startswith("http://dbpedia.org"):
                try:
                    year = row["productionStartYear"][:4]
                    if year == "NULL":
                        bad.append(row)
                    elif 1885 < int(year) < 2015:
                        row["productionStartYear"] = int(year)
                        good.append(row)

                    else:
                        bad.append(row)

                except ValueError as err:
                    bad.append(row)
                    # print "ERROR:", err
                    # print "YEAR", year
                    pass


    # This is just an example on how you can use csv.DictWriter
    # Remember that you have to output 2 files
    with open(output_good, "w") as g:
        writer = csv.DictWriter(g, delimiter=",", fieldnames= header)
        writer.writeheader()
        for row in good:
            writer.writerow(row)

    with open(output_bad, "w") as b:
        writer = csv.DictWriter(b, delimiter=",", fieldnames= header)
        writer.writeheader()
        for row in bad:
            writer.writerow(row)


def test():

    process_file(INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)


if __name__ == "__main__":
    test()

Q3.1

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then
clean it up. In the first exercise we want you to audit the datatypes that can be found in some 
particular fields in the dataset.
The possible types of values can be:
- 'NoneType' if the value is a string "NULL" or an empty string ""
- 'list', if the value starts with "{"
- 'int', if the value can be cast to int
- 'float', if the value can be cast to float, but is not an int
- 'str', for all other values

The audit_file function should return a dictionary containing fieldnames and a set of the datatypes
that can be found in the field.
All the data initially is a string, so you have to do some checks on the values first.

"""
import codecs
import csv
import json
import pprint

CITIES = 'data/cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label", "isPartOf_label", "areaCode", "populationTotal", 
          "elevation", "maximumElevation", "minimumElevation", "populationDensity", "wgs84_pos#lat", "wgs84_pos#long", 
          "areaLand", "areaMetro", "areaUrban"]

def is_number(check):
    try:
        float(check)
        return True
    except ValueError:
        return False

    
    
def audit_file(filename, fields):
    fieldtypes = {}

    with open(filename, "r") as file:
        c = csv.DictReader(file)
        
        for i in fields:
            fieldtypes[i] = set()  
            
        for row in c:
            if not row["URI"].startswith("http://dbpedia"):
                continue

                
            for row in c:
                for field in fields:
                    if row[field] == "NULL" or row[field] == "":
                        fieldtypes[field].add(type(None))
                    elif row[field].startswith("{"):
                        fieldtypes[field].add(type([]))
                    elif not is_number(row[field]):
                        fieldtypes[field].add(type("string"))
                    else:
                        if not "." in row[field]:
                            fieldtypes[field].add(type(1))
                        else:
                            fieldtypes[field].add(type(1.1))
        return fieldtypes



def test():
    fieldtypes = audit_file(CITIES, FIELDS)

    #pprint.pprint(fieldtypes)

    assert fieldtypes["areaLand"] == set([type(1.1), type([]), type(None)])
    assert fieldtypes['areaMetro'] == set([type(1.1), type(None)])
    print "Passed."
    
if __name__ == "__main__":
    test()

Passed.

Q3.2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.

Since in the previous quiz you made a decision on which value to keep for the "areaLand" field,
you now know what has to be done.

Finish the function fix_area(). It will receive a string as an input, and it has to return a float
representing the value of the area or None.
You have to change the function fix_area. You can use extra functions if you like, but changes to process_file
will not be taken into account.
The rest of the code is just an example on how this function can be used.
"""
import codecs
import csv
import json
import pprint

CITIES = 'data/cities.csv'



def fix_area(area):

    if area == "NULL":
        return None
    elif area[0] == "{":
        l = area.strip("{}").split("|")
        # Get string without 0's
        l1, l2 = str(l[0]).replace("e+", "").replace("0", ""), str(l[1]).replace("e+", "").replace("0", "")
        # Compare length of non-zero "significant" digits
        # Then return original which has more as a float
        if len(l1) > len(l2):
            return float(l[0])
        else:
            return float(l[1])
    return float(area)


def process_file(filename):
    # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE
    data = []

    with open(filename, "r") as f:
        reader = csv.DictReader(f)

        #skipping the extra matadata
        for i in range(3):
            l = reader.next()

        # processing file
        for line in reader:
            # calling your function to fix the area value
            if "areaLand" in line:
                line["areaLand"] = fix_area(line["areaLand"])
            data.append(line)

    return data


def test():
    data = process_file(CITIES)

    print "Printing three example results:"
    for n in range(5,8):
        pprint.pprint(data[n]["areaLand"])
    

    assert data[8]["areaLand"] == 55166700.0
    assert data[3]["areaLand"] == None


if __name__ == "__main__":
    test()

Printing three example results:
None
101787000.0
31597900.0

Q3.4

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.

In the previous quiz you recognized that the "name" value can be an array (or list in Python terms).
It would make it easier to process and query the data later, if all values for the name 
would be in a Python list, instead of being just a string separated with special characters, like now.

Finish the function fix_name(). It will recieve a string as an input, and it has to return a list
of all the names. If there is only one name, the list with have only one item in it, if the name is "NULL",
the list should be empty.
The rest of the code is just an example on how this function can be used
"""
import codecs
import csv
import pprint

CITIES = 'data/cities.csv'


def fix_name(name):

    if name == "NULL":
        return []
    if name.startswith("{"):
        return name.replace("{", "").replace("}", "").strip().split("|")
    else:
        return [name]


def process_file(filename):
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        #skipping the extra matadata
        for i in range(3):
            l = reader.next()
        # processing file
        for line in reader:
            # calling your function to fix the area value
            if "name" in line:
                line["name"] = fix_name(line["name"])
            data.append(line)
    return data


def test():
    data = process_file(CITIES)

    print "Printing 20 results:"
    for n in range(20):
        pprint.pprint(data[n]["name"])

    assert data[14]["name"] == ['Negtemiut', 'Nightmute']
    assert data[3]["name"] == ['Kumhari']

if __name__ == "__main__":
    test()

Printing 20 results:
['Kud']
['Kuju']
['Kumbhraj']
['Kumhari']
['Kunigal']
['Kurgunta']
['Athens']
['Demopolis']
['Chelsea Alabama']
['Pell City Alabama']
['City of Northport']
['Sand Point']
['Unalaska Alaska']
['City of Menlo Park']
['Negtemiut', 'Nightmute']
['Fairbanks Alaska']
['Homer']
['Ketchikan Alaska']
['Nuniaq', 'Old Harbor']
['Rainier Washington']

Q3.6

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.

If you look at the full city data, you will notice that there are couple of values that seem to provide
the same information in different formats: "point" seems to be the combination of "wgs84_pos#lat" and "wgs84_pos#long".
However we do not know if that is the case and should check if they are equivalent.

Finish the function check_loc(). It will recieve 3 strings, first will be the combined value of "point" and then the
"wgs84_pos#" values separately. You have to extract the lat and long values from the "point" and compare
to the "wgs84_pos# values and return True or False.

Note that you do not have to fix the values, just determine if they are consistent. To fix them in this case
you would need more information. Feel free to discuss possible strategies for fixing this on the discussion forum.

The rest of the code is just an example on how this function can be used.
Changes to "process_file" function will not be take into account.
"""
import csv
import pprint

CITIES = 'data/cities.csv'


def check_loc(point, lat, longi):

    check1, check2 = point.split(" ")
    if check1 == lat and check2 == longi:
        return True
    else:
        return False


def process_file(filename):
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        #skipping the extra matadata
        for i in range(3):
            l = reader.next()
        # processing file
        for line in reader:
            # calling your function to check the location
            result = check_loc(line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"])
            if not result:
                print "{}: {} != {} {}".format(line["name"], line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"])
            data.append(line)

    return data


def test():
    assert check_loc("33.08 75.28", "33.08", "75.28") == True
    assert check_loc("44.57833333333333 -91.21833333333333", "44.5783", "-91.2183") == False
    print "Passed."

if __name__ == "__main__":
    test()

Passed.

Kicking the tires on MongoDB

"""
Your task is to sucessfully run the exercise to see how pymongo works
and how easy it is to start using it.
You don't actually have to change anything in this exercise,
but you can change the city name in the add_city function if you like.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB (see Instructor comments for link to installation information)
and uncomment the get_db function.
"""


def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # 'examples' here is the database name. It will be created if it does not exist.
    db = client.examples
    return db


def add_city(db):
    db.cities.insert({"name" : "Chicago"})
    
def get_city(db):
    return db.cities.find_one()


if __name__ == "__main__":

    db = get_db() # uncomment this line if you want to run this locally
    add_city(db)
    print get_city(db)

{u'_id': ObjectId('53b27aeecf9ef83870ffda22'), u'name': u'Chicago'}

Finding Porsche

#!/usr/bin/env python
"""
Your task is to complete the 'porsche_query' function and in particular the query
to find all autos where the manufacturer field matches "Porsche".
Please modify only 'porsche_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB and download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials at
the following link:
https://www.udacity.com/wiki/ud032
"""


def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db


def porsche_query():
    
    query = {"manufacturer": "Porsche"}
    return query


def find_porsche(db, query):
    return db.autos.find(query)


if __name__ == "__main__":

    db = get_db('examples')
    query = porsche_query()
    p = find_porsche(db, query)
    import pprint
    # Print only the first record.
    for a in p[0:1]:
        pprint.pprint(a)

{u'_id': ObjectId('53b3629acf9ef81fe4999d69'),
 u'assembly': [u'Germany', u'Stuttgart'],
 u'bodyStyle': u'coup\xe9',
 u'class': u'grand tourer',
 u'dimensions': {u'height': 1.27508,
                 u'length': 4.52,
                 u'weight': 1450.0,
                 u'wheelbase': 2.5,
                 u'width': 1.89},
 u'engine': [u'Porsche_928__1',
             u'Porsche_928__2',
             u'Porsche_928__3',
             u'Porsche_928__4'],
 u'layout': u'front-engine rear-wheel-drive layout',
 u'manufacturer': u'Porsche',
 u'modelYears': [],
 u'name': u'Porsche 928',
 u'productionYears': [1977,
                      1978,
                      1979,
                      1980,
                      1981,
                      1982,
                      1983,
                      1984,
                      1985,
                      1986,
                      1987,
                      1988,
                      1989,
                      1990,
                      1991,
                      1992,
                      1993,
                      1994,
                      1995],
 u'transmission': [u'3-speed automatic',
                   u'4-speed automatic',
                   u'5-speed manual']}

Inserting Multiple Documents

from pymongo import MongoClient
import csv
import json
import io
import re
import pprint


field_map = {
    "name" : "name",
    "bodyStyle_label" : "bodyStyle",
    "assembly_label" : "assembly",
    "class_label" : "class",
    "designer_label" : "designer",
    "engine_label" : "engine",
    "length" : "length",
    "height" : "height",
    "width" : "width",
    "weight" : "weight",
    "wheelbase" : "wheelbase",
    "layout_label" : "layout",
    "manufacturer_label" : "manufacturer",
    "modelEndYear" : "modelEndYear",
    "modelStartYear" : "modelStartYear",
    "predecessorLabel" : "predecessorLabel",
    "productionStartYear" : "productionStartYear",
    "productionEndYear" : "productionEndYear",
    "transmission" : "transmission"
}
fields = field_map.keys()


def skip_lines(input_file, skip):
    for i in range(0, skip):
        next(input_file)

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def strip_automobile(v):
    return re.sub(r"\s*\(automobile\)\s*", " ", v)

def strip_city(v):
    return re.sub(r"\s*\(city\)\s*", " ", v)

def parse_array(v):
    if (v[0] == "{") and (v[-1] == "}"):
        v = v.lstrip("{")
        v = v.rstrip("}")
        v_array = v.split("|")
        v_array = [i.strip() for i in v_array]
        return v_array
    return v

def mm_to_meters(v):
    if v < 0.01:
        return v * 1000
    return v

def clean_dimension(d, field, v):
    if is_number(v):
        if field == "weight":
            d[field] = float(v) / 1000.0
        else:
            d[field] = mm_to_meters(float(v))
    
def clean_year(d, field, v):
    d[field] = v[0:4]

def parse_array2(v):
    if (v[0] == "{") and (v[-1] == "}"):
        v = v.lstrip("{")
        v = v.rstrip("}")
        v_array = v.split("|")
        v_array = [i.strip() for i in v_array]
        return (True, v_array)
    return (False, v)

def ensure_not_array(v):
    (is_array, v) = parse_array(v)
    if is_array:
        return v[0]
    return v

def ensure_array(v):
    (is_array, v) = parse_array2(v)
    if is_array:
        return v
    return [v]

def ensure_float(v):
    if is_number(v):
        return float(v)

def ensure_int(v):
    if is_number(v):
        return int(v)

def ensure_year_array(val):
    #print "val:", val
    vals = ensure_array(val)
    year_vals = []
    for v in vals:
        v = v[0:4]
        v = int(v)
        if v:
            year_vals.append(v)
    return year_vals

def empty_val(val):
    val = val.strip()
    return (val == "NULL") or (val == "")

def years(row, start_field, end_field):
    start_val = row[start_field]
    end_val = row[end_field]

    if empty_val(start_val) or empty_val(end_val):
        return []

    start_years = ensure_year_array(start_val)
    if start_years:
        start_years = sorted(start_years)
    end_years = ensure_year_array(end_val)
    if end_years:
        end_years = sorted(end_years)
    all_years = []
    if start_years and end_years:
        #print start_years
        #print end_years
        for i in range(0, min(len(start_years), len(end_years))):
            for y in range(start_years[i], end_years[i]+1):
                all_years.append(y)
    return all_years


def process_file(input_file):
    input_data = csv.DictReader(open(input_file))
    autos = []
    skip_lines(input_data, 3)
    for row in input_data:
        auto = {}
        model_years = {}
        production_years = {}
        dimensions = {}
        for field, val in row.iteritems():
            if field not in fields or empty_val(val):
                continue
            if field in ["bodyStyle_label", "class_label", "layout_label"]:
                val = val.lower()
            val = strip_automobile(val)
            val = strip_city(val)
            val = val.strip()
            val = parse_array(val)
            if field in ["length", "width", "height", "weight", "wheelbase"]:
                clean_dimension(dimensions, field_map[field], val)
            elif field in ["modelStartYear", "modelEndYear"]:
                clean_year(model_years, field_map[field], val)
            elif field in ["productionStartYear", "productionEndYear"]:
                clean_year(production_years, field_map[field], val)
            else:
                auto[field_map[field]] = val
        if dimensions:
            auto['dimensions'] = dimensions
        auto['modelYears'] = years(row, 'modelStartYear', 'modelEndYear')
        auto['productionYears'] = years(row, 'productionStartYear', 'productionEndYear')
        autos.append(auto)
    return autos

# -------------------------------------------------------------------------------------------------------------

def insert_autos(infile, db):
    autos = process_file(infile)
    
    for a in autos:
        db.autos.insert(a)
    # Your code here. Insert the data in one command
    # autos will be a list of dictionaries, as in the example in the previous video
    # You have to insert data in a collection 'autos'


  
if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    insert_autos('data/autos-small.csv', db)
    pprint.pprint(db.autos.find_one())

{u'_id': ObjectId('53b3629acf9ef81fe4999d66'),
 u'dimensions': {u'length': 39.9288, u'weight': 2721000.0, u'width': 34.7472},
 u'engine': u'Crawler-transporter__1',
 u'manufacturer': u'Marion Power Shovel Company',
 u'modelYears': [],
 u'name': u'Crawler-transporter',
 u'productionYears': [],
 u'transmission': u'16 traction motors powered by four  generators'}

Range Queries

#!/usr/bin/env python
""" Your task is to write a query that will return all cities
that are founded in 21st century.
Please modify only 'range_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""
from datetime import datetime
    
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


def range_query():
    # You can use datetime(year, month, day) to specify date in the query
    query = {"foundingDate": {"$gte": datetime(2001, 1, 1), "$lt": datetime(2100, 1, 1)}}
    return query


if __name__ == "__main__":

    db = get_db()
    query = range_query()
    cities = db.cities.find(query)

    print "Found cities:", cities.count()
    import pprint
    pprint.pprint(cities[0])

Using $in Operator

#!/usr/bin/env python
""" Your task is to write a query that will return all cars manufactured by "Ford Motor Company"
that are assembled in Germany, United Kingdom, or Japan.
Please modify only 'in_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""

def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


def in_query():
    # Write the query
    query = {"manufacturer": "Ford Motor Company", "assembly":{"$in": ["Germany", "United Kingdom", "Japan"]} }
    
    return query


if __name__ == "__main__":

    db = get_db()
    query = in_query()
    autos = db.autos.find(query, {"name":1, "manufacturer":1, "assembly": 1, "_id":0})

    print "Found autos:", autos.count()
    import pprint
    # Print first record only
    for a in autos[0:1]:
        pprint.pprint(a)

Found autos: 34
{u'assembly': [u'Argentina',
               u'Australia',
               u'Berlin',
               u'Brazil',
               u'Buenos Aires',
               u'Canada',
               u'Copenhagen',
               u'Cork',
               u'Denmark',
               u'Detroit',
               u'Dothan Alabama',
               u'England',
               u'Geelong',
               u'Germany',
               u'Highland Park Michigan',
               u'Ireland',
               u'Manchester',
               u'Minneapolis',
               u'Ontario',
               u'S\xe3o Bernardo do Campo',
               u'Saint Paul Minnesota',
               u'Toronto',
               u'Walkerville Ontario'],
 u'manufacturer': u'Ford Motor Company',
 u'name': u'Ford Model T'}

Dot Notation

#!/usr/bin/env python
""" Your task is to write a query that will return all cars with width dimension greater than 2.5
Please modify only 'dot_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""


def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


def dot_query():
    query = {"dimensions.width":{"$gt":2.5}}
    return query


if __name__ == "__main__":

    db = get_db()
    query = dot_query()
    cars = db.autos.find(query, {"dimensions.width":1, "_id":0, "name":1})

    print "Found cars:", autos.count()
    import pprint
    for car in cars[0:5]:
        pprint.pprint(car)

Found cars: 34
{u'dimensions': {u'width': 34.7472}, u'name': u'Crawler-transporter'}
{u'dimensions': {u'width': 3.7}, u'name': u'Thrust SSC'}
{u'dimensions': {u'width': 8.7}, u'name': u'Liebherr T 282B'}
{u'dimensions': {u'width': 2.5908}, u'name': u'Nova Bus LFS'}
{u'dimensions': {u'width': 17.2212}, u'name': u'Wolseley 6/90'}

Q4.1

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it, clean it, 
come up with a data model, insert it into a MongoDB and then run some queries against your database.
The set contains data about Arachnid class.
Your task in this exercise is to parse the file, process only the fields that are listed in the
FIELDS dictionary as keys, and return a dictionary of cleaned values. 

The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim out redundant description in parenthesis from the 'rdf-schema#label' field, like "(spider)"
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
  by stripping the "{}" characters and splitting the string on "|". Rest of the cleanup is up to you,
  eg removing "*" prefixes etc
- strip leading and ending whitespace from all fields, if there is any
- the output structure should be as follows:
{ 'label': 'Argiope',
  'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
  'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
  'name': 'Argiope',
  'synonym': ["One", "Two"],
  'classification': {
                    'family': 'Orb-weaver spider',
                    'class': 'Arachnid',
                    'phylum': 'Arthropod',
                    'order': 'Spider',
                    'kingdom': 'Animal',
                    'genus': None
                    }
}
"""
import codecs
import csv
import json
import pprint
import re

DATAFILE = 'data/arachnid.csv'
FIELDS ={'rdf-schema#label': 'label',
         'URI': 'uri',
         'rdf-schema#comment': 'description',
         'synonym': 'synonym',
         'name': 'name',
         'family_label': 'family',
         'class_label': 'class',
         'phylum_label': 'phylum',
         'order_label': 'order',
         'kingdom_label': 'kingdom',
         'genus_label': 'genus'}


def clean_array(temp_array):
    final_array = []
    remove_these = ["Pocock", "Forster", "Couzijn", "Thorell", "Peckham"]
    for strings in temp_array:
        temp_string = strings.replace("*", "").strip()
        for r in remove_these:
            if r in strings:
                temp_string = temp_string.split(r)[0].strip()
            if temp_string[-1:] == "(":
                temp_string = temp_string[:-1].strip()
        final_array.append(temp_string.strip())
    return final_array


def process_file(filename, fields):
    process_fields = fields.keys()

    data = []

    class_fields = ["class", "family", "genus", "kingdom", "order", "phylum"]

    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for i in range(3):
            l = reader.next()
        for line in reader:

              # Create new dictionary structure containing only the fields that are passed in.
            temp_dict = {}
            class_dict = {}
            for old_key in process_fields:
                new_key = fields[old_key]
                if new_key in ["label", "uri", "description", "name", "synonym"]:
                    temp_dict[new_key] = line[old_key].strip()
                else:
                    class_dict[new_key] = line[old_key].strip()
                    if class_dict[new_key] == "NULL":
                        class_dict[new_key] = None

            temp_dict["classification"] = class_dict

            for new_field in temp_dict.keys():

                  # Remove (extra names) from labels
                if new_field == "label":
                    temp_dict["label"] = temp_dict["label"].split("(")[0].strip()

                  # Check for non-alphanumeric chars, if found, replace "name" with "label"
                if new_field == "name":
                    if re.search('[A-Za-z0-9]*', temp_dict[new_field]).group() != temp_dict[new_field]:
                        temp_dict[new_field] = temp_dict["label"].strip()

                  # Change all NULL entries to None, except in "name" where NULL is changed to "label" entry.
                if temp_dict[new_field] == "NULL":
                    if new_field == "name":
                        temp_dict[new_field] = temp_dict["label"].strip()
                    else:
                        temp_dict[new_field] = None
                  # Split synonyms into list of synonyms. Pass to clean_array() for further cleaning.
                if new_field == "synonym" and temp_dict["synonym"] is not None:
                    temp_array = parse_array(temp_dict["synonym"])
                    temp_dict["synonym"] = clean_array(temp_array)

            data.append(temp_dict)

    return data


def parse_array(v):
    if (v[0] == "{") and (v[-1] == "}"):
        v = v.lstrip("{")
        v = v.rstrip("}")
        v_array = v.split("|")
        v_array = [i.strip() for i in v_array]
        return v_array
    return [v]


def test():
    data = process_file(DATAFILE, FIELDS)

    pprint.pprint(data[0])
    assert data[0] == {
                        "synonym": None, 
                        "name": "Argiope", 
                        "classification": {
                            "kingdom": "Animal", 
                            "family": "Orb-weaver spider", 
                            "order": "Spider", 
                            "phylum": "Arthropod", 
                            "genus": None, 
                            "class": "Arachnid"
                        }, 
                        "uri": "http://dbpedia.org/resource/Argiope_(spider)", 
                        "label": "Argiope", 
                        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
                    }


if __name__ == "__main__":
    test()

{'classification': {'class': 'Arachnid',
                    'family': 'Orb-weaver spider',
                    'genus': None,
                    'kingdom': 'Animal',
                    'order': 'Spider',
                    'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}

Q4.2

import json
import pprint

def insert_data(data, db):

    for a in data:
        db.arachnid.insert(a)


if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    with open('data/arachnid.json') as f:
        data = json.loads(f.read())
        insert_data(data, db)
        pprint.pprint(db.arachnid.find_one())

{u'_id': ObjectId('53b3ef08cf9ef82c701090b5'),
 u'classification': {u'class': u'Arachnid',
                     u'family': u'Orb-weaver spider',
                     u'genus': None,
                     u'kingdom': u'Animal',
                     u'order': u'Spider',
                     u'phylum': u'Arthropod'},
 u'description': u'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 u'label': u'Argiope',
 u'name': u'Argiope',
 u'synonym': None,
 u'uri': u'http://dbpedia.org/resource/Argiope_(spider)'}

Q4.3

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it, clean it, 
come up with a data model, insert it into a MongoDB and then run some queries against your database.
The set contains data about Arachnid class.

The data is already in the database. But you have been given a task to also include 'binomialAuthority'
information in the data, so you have to go through the data and update the existing entries.

The following things should be done in the function add_field:
- process the csv file and extract 2 fields - 'rdf-schema#label' and 'binomialAuthority_label'
- clean up the 'rdf-schema#label' same way as in the first exercise - removing redundant "(spider)" suffixes
- return a dictionary, with 'label' being the key, and 'binomialAuthority_label' the value
- if 'binomialAuthority_label' is "NULL", skip the item

The following should be done in the function update_db:
- query the database by using the field 'label'
- update the data, by adding a new item under 'classification' with a key 'binomialAuthority'


The resulting data should look like this:
- the output structure should be as follows:
{ 'label': 'Argiope',
  'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
  'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
  'name': 'Argiope',
  'synonym': ["One", "Two"],
  'classification': {
                    'binomialAuthority': None,
                    'family': 'Orb-weaver spider',
                    'class': 'Arachnid',
                    'phylum': 'Arthropod',
                    'order': 'Spider',
                    'kingdom': 'Animal',
                    'genus': None
                    }
}
"""
import codecs
import csv
import json
import pprint

DATAFILE = 'data/arachnid.csv'
FIELDS ={'rdf-schema#label': 'label',
         'binomialAuthority_label': 'binomialAuthority'}


def add_field(filename, fields):

    process_fields = fields.keys()
    data = {}
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for i in range(3):
            l = reader.next()

        for line in reader:
                if line["binomialAuthority_label"] != "NULL":
                    data[line["rdf-schema#label"].split("(")[0].strip()] = line["binomialAuthority_label"]

    return data


def update_db(data, db):

    for k in data.keys():

        #all_r = db.arachnid.find({"label": k})
        #for a in all_r:
        #    pprint.pprint(a), "P"

        db.arachnid.update({"label": k}, {"$set": {"classification.binomialAuthority": data[k]}}, multi=True)


def test():
    # Please change only the add_field and update_db functions!
    # Changes done to this function will not be taken into account
    # when doing a Test Run or Submit, they are just for your own reference
    # and as an example for running this code locally!
    
    data = add_field(DATAFILE, FIELDS)
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    update_db(data, db)

    updated = db.arachnid.find_one({'label': 'Opisthoncana'})
    assert updated['classification']['binomialAuthority'] == 'Embrik Strand'
    pprint.pprint(data)



if __name__ == "__main__":
    test()

{'Opisthoncana': 'Embrik Strand',
 'Orvilleus': 'Arthur M. Chickering',
 'Six-spotted fishing spider': 'Charles Athanase Walckenaer',
 'Zealanapis australis': '{1951 in science|Raymond Robert Forster}'}

Using Group

import json 
import pprint

def insert_data(data, db):
    for a in data:
        db.twitter.insert(a)

if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples
    
    # Available here: http://content.udacity-data.com/ud032/twitter/twitter.json.zip
    with open('data/twitter.json', 'r') as f:
        ## json.loads() takes a string, while json.load() takes a file-like object.
        ## http://stackoverflow.com/questions/11568246/loading-several-text-files-into-mongodb-using-pymongo
        for tweet in f.readlines():
            db.twitter.insert(json.loads(tweet))
    pprint.pprint(db.twitter.find_one())

{u'_id': ObjectId('54c81ed3cf9ef82b0871a383'),
 u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Thu Sep 02 18:11:23 +0000 2010',
 u'entities': {u'hashtags': [], u'urls': [], u'user_mentions': []},
 u'favorited': False,
 u'geo': None,
 u'id': 22819396900L,
 u'in_reply_to_screen_name': None,
 u'in_reply_to_status_id': None,
 u'in_reply_to_user_id': None,
 u'place': None,
 u'retweet_count': None,
 u'retweeted': False,
 u'source': u'web',
 u'text': u'eu preciso de terminar de fazer a minha tabela, est\xe1 muito foda **',
 u'truncated': False,
 u'user': {u'contributors_enabled': False,
           u'created_at': u'Fri Jul 03 21:44:05 +0000 2009',
           u'description': u's\xf3 os loucos sabem (:',
           u'favourites_count': 1,
           u'follow_request_sent': None,
           u'followers_count': 102,
           u'following': None,
           u'friends_count': 73,
           u'geo_enabled': False,
           u'id': 53507833,
           u'lang': u'en',
           u'listed_count': 0,
           u'location': u'',
           u'name': u'Beatriz Helena Cunha',
           u'notifications': None,
           u'profile_background_color': u'081114',
           u'profile_background_image_url': u'http://a1.twimg.com/profile_background_images/133178546/biatwitter.jpg',
           u'profile_background_tile': True,
           u'profile_image_url': u'http://a2.twimg.com/profile_images/1036412454/OgAAADXK9q6kaxrvfwQTINH66RVLAH9YHb-veRTA4FaWb9KtbGGV_yKTGzmvzTfJidqAb5gK_mpspIE-MIvAASGH2CwAm1T1UIPQk0-HS8x_TV5kdnW30nch7ODk-1_normal.jpg',
           u'profile_link_color': u'eb55b6',
           u'profile_sidebar_border_color': u'1c9dbd',
           u'profile_sidebar_fill_color': u'768575',
           u'profile_text_color': u'25b8c2',
           u'profile_use_background_image': True,
           u'protected': False,
           u'screen_name': u'Bia_cunha1',
           u'show_all_inline_media': False,
           u'statuses_count': 3504,
           u'time_zone': u'Brasilia',
           u'url': u'http://http://www.orkut.com.br/Main#Profile?uid=1433295880233078770',
           u'utc_offset': -10800,
           u'verified': False}}

#!/usr/bin/env python
"""
The tweets in our twitter collection have a field called "source". This field describes the application
that was used to create the tweet. Following the examples for using the $group operator, your task is 
to modify the 'make-pipeline' function to identify most used applications for creating tweets. 
As a check on your query, 'web' is listed as the most frequently used application.
'Ubertwitter' is the second most used. 

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline
that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation 
pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. 
If you want to run this code locally on your machine, you have to install MongoDB, 
download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset 
used in examples in this lesson. 
If you attempt some of the same queries that we looked at in the lesson examples,
your results will be different.
"""


def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    return db

def make_pipeline():
    pipeline = [{"$group": {"_id": "$source",
                            "count": {"$sum": 1}}},
                 {"$sort": {"count": -1}},
                 {"$limit" : 5 }]
    return pipeline

def tweet_sources(db, pipeline):
    result = db.tweets.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = tweet_sources(db, pipeline)
    import pprint
    pprint.pprint(result)

{u'ok': 1.0,
 u'result': [{u'_id': u'web', u'count': 69410},
             {u'_id': u'<a href="http://www.ubertwitter.com/bb/download.php" rel="nofollow">\xdcberTwitter</a>',
              u'count': 10179},
             {u'_id': u'<a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a>',
              u'count': 10110},
             {u'_id': u'<a href="http://blackberry.com/twitter" rel="nofollow">Twitter for BlackBerry\xae</a>',
              u'count': 6747},
             {u'_id': u'<a href="http://twitter.com/" rel="nofollow">Twitter for iPhone</a>',
              u'count': 6027}]}

Using match and project

#!/usr/bin/env python
"""
Write an aggregation query to answer this question:

Of the users in the "Brasilia" timezone who have tweeted 100 times or more,
who has the largest number of followers?

The following hints will help you solve this problem:
- Time zone is found in the "time_zone" field of the user object in each tweet.
- The number of tweets for each user is found in the "statuses_count" field.
  To access these fields you will need to use dot notation (from Lesson 4)
- Your aggregation query should return something like the following:
{u'ok': 1.0,
 u'result': [{u'_id': ObjectId('52fd2490bac3fa1975477702'),
                  u'followers': 2597,
                  u'screen_name': u'marbles',
                  u'tweets': 12334}]}

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson,
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used 
in examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$match": {"user.time_zone": "Brasilia",
                            "user.statuses_count": {"$gte": 100}}},
                {"$project": {"followers": "$user.followers_count",
                              "screen_name": "$user.screen_name",
                              "tweets": "$user.statuses_count"}},
                {"$sort": {"followers": -1}},
                {"$limit": 1}]
    return pipeline

def aggregate(db, pipeline):
    result = db.tweets.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    pprint.pprint(result)
    assert len(result["result"]) == 1
    # Online quiz uses smaller dataset. 
    # Full dataset is loaded here giving slightly different results for aggregations.
    # assert result["result"][0]["followers"] == 17209

{u'ok': 1.0,
 u'result': [{u'_id': ObjectId('54c81ddbcf9ef82b086f7611'),
              u'followers': 259760,
              u'screen_name': u'otaviomesquita',
              u'tweets': 10997}]}

Using Unwind

#!/usr/bin/env python
"""
For this exercise, let's return to our cities infobox dataset. The question we would like you to answer
is as follows:  Which region in India contains the most cities?

As a starting point, use the solution for the example question we looked at -- "Who includes the most
user mentions in their tweets?"

One thing to note about the cities data is that the "isPartOf" field contains an array of regions or 
districts in which a given city is found. See the example document in Instructor Comments below.

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline 
that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation 
pipeline should be a list of one or more dictionary objects. Please review the lesson examples if you 
are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code 
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the dataset used in 
examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$unwind": "$isPartOf"},
                {"$match": {"country": "India"}},
                {"$group": {"_id": "$isPartOf",
                            "count": {"$sum": 1}}},
                {"$sort": {"count": -1}}]
    return pipeline

def aggregate(db, pipeline):
    result = db.cities.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    print "Printing the first result:"
    import pprint
    pprint.pprint(result["result"][0])
    assert result["result"][0]["_id"] == "Uttar Pradesh"
    assert result["result"][0]["count"] == 623

Using Push

#!/usr/bin/env python
"""
$push is similar to $addToSet. The difference is that rather than accumulating only unique values 
it aggregates all values into an array.

Using an aggregation query, count the number of tweets for each user. In the same $group stage, 
use $push to accumulate all the tweet texts for each user. Limit your output to the 5 users
with the most tweets. 
Your result documents should include only the fields:
"_id" (screen name of user), 
"count" (number of tweets found for the user),
"tweet_texts" (a list of the tweet texts found for the user).  

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code 
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used in 
examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$group": {"_id": "$user.screen_name",
                            "tweet_texts": {"$push": "$text"},
                            "count": {"$sum": 1}}},
                {"$sort": {"count": -1}},
                {"$limit": 5}]
    return pipeline

def aggregate(db, pipeline):
    result = db.tweets.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    assert len(result["result"]) == 5
    assert result["result"][0]["count"] > result["result"][4]["count"]
    import pprint
    pprint.pprint(result['result'][0])

{u'_id': u'behcolin',
 u'count': 24,
 u'tweet_texts': [u'RT @VouConfessarQue: #VouConfessarQue j\xe1 aprendi uma mat\xe9ria inteira poucos minutos antes de uma prova.',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'volto jah',
                  u'RT @VouConfessarQue: #VouConfessarQue j\xe1 aprendi uma mat\xe9ria inteira poucos minutos antes de uma prova.',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'volto jah',
                  u'RT @VouConfessarQue: #VouConfessarQue j\xe1 aprendi uma mat\xe9ria inteira poucos minutos antes de uma prova.',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'RT @TweetGargalhada: Geisy Arruda nos TT? Achei que tinha finalmente sumido, mas pelo jeito a mar\xe9 trouxe ela de volta!',
                  u'volto jah']}

Same Operator P

#!/usr/bin/env python
"""
In an earlier exercise we looked at the cities dataset and asked which region in India contains 
the most cities. In this exercise, we'd like you to answer a related question regarding regions in 
India. What is the average city population for a region in India? Calculate your answer by first 
finding the average population of cities in each region and then by calculating the average of the 
regional averages.

Hint: If you want to accumulate using values from all input documents to a group stage, you may use 
a constant as the value of the "_id" field. For example, 
    { "$group" : {"_id" : "India Regional City Population Average",
      ... }

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code 
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used 
in examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$match": {"country": "India"}},
                # First, match India as the country of interest; data contains world data.
                {"$unwind": "$isPartOf"},
                # Unwind regions; some cities belong to multiple regions.
                {"$group": {"_id": "$isPartOf",
                            # Now group on each region.
                            "totPop": {"$sum": "$population"},
                            # Sum up the population of all of the cities for each region.
                            "count": {"$sum": 1},
                            # Count the number of times each region shows up.
                 "average": {"$avg": "$population"}}},
                # Create an average for each region.
                {"$group": {"_id": "India Regional City Population Average",
                # Now group by a constant to group everything together.
                 "avg": {"$avg": "$average"}}}]
                # And finally, get an average of the average region populations.
    return pipeline

def aggregate(db, pipeline):
    result = db.cities.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    assert len(result["result"]) == 1
    assert result["result"][0]["avg"] == 196025.97814809752
    import pprint
    pprint.pprint(result)

Q5.1

#!/usr/bin/env python
"""
Use an aggregation query to answer the following question. 

What is the most common city name in our cities collection?

Your first attempt probably identified None as the most frequently occurring city name. 
What that actually means is that there are a number of cities without a name field at all. 
It's strange that such documents would exist in this collection and, depending on your situation, 
might actually warrant further cleaning. 

To solve this problem the right way, we should really ignore cities that don't have a name specified. 
As a hint ask yourself what pipeline operator allows us to simply filter input? 
How do we test for the existence of a field?

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline 
that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. 
If you want to run this code locally on your machine, you have to install MongoDB, 
download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used in 
examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$match": {"name": {"$exists": True}}},
                {"$group": {"_id": "$name",
                            "count": {"$sum": 1}}},
                {"$sort": {"count": -1}},
                {"$limit": 1}]
    return pipeline

def aggregate(db, pipeline):
    result = db.cities.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    pprint.pprint(result["result"][0])
    assert len(result["result"]) == 1
    assert result["result"][0] == {'_id': 'Shahpur', 'count': 6}

Q5.2

#!/usr/bin/env python
"""
Use an aggregation query to answer the following question. 

Which Region in India has the largest number of cities with longitude between 75 and 80?

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this 
code locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used in 
examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$match": {"country": "India",
                            "lon": {"$gte": 75},
                            "lon": {"$lte": 80}}},
                {"$unwind": "$isPartOf"},
                {"$group": {"_id": "$isPartOf",
                            "count": {"$sum": 1}}},
                {"$sort": {"count": -1}},
                {"$limit": 1}]
    return pipeline

def aggregate(db, pipeline):
    result = db.cities.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    pprint.pprint(result["result"][0])
    assert len(result["result"]) == 1
    assert result["result"][0]["_id"] == 'Tamil Nadu'

Q5.3

#!/usr/bin/env python
"""
Use an aggregation query to answer the following question. 

Extrapolating from an earlier exercise in this lesson, find the average regional city population 
for all countries in the cities collection. What we are asking here is that you first calculate the 
average city population for each region in a country and then calculate the average of all the 
regional averages for a country. As a hint, _id fields in group stages need not be single values. 
They can also be compound keys (documents composed of multiple fields). You will use the same 
aggregation operator in more than one stage in writing this aggregation query. I encourage you to 
write it one stage at a time and test after writing each stage.

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code 
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used in 
examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$unwind": "$isPartOf"},
                {"$group": {"_id": {"Country": "$country",
                                    "Region": "$isPartOf"},
                            "avgCity": {"$avg": "$population"}}},
                {"$group": {"_id": "$_id.Country",
                            "avgRegionalPopulation": {"$avg": "$avgCity"}}}]
    return pipeline

def aggregate(db, pipeline):
    result = db.cities.aggregate(pipeline)
    return result

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    if len(result["result"]) < 150:
        pprint.pprint(result["result"])
    else:
        pprint.pprint(result["result"][:100])
    for country in result["result"]:
        if country["_id"] == 'Kuwait':
            assert country == {'_id': 'Kuwait', 'avgRegionalPopulation': 115945.66666666667}

Iterative Parsing

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
The output should be a dictionary with the tag name as the key
and number of times this tag can be encountered in the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.ElementTree as ET
import pprint
from collections import defaultdict

def count_tags(filename):
#         counts = dict()
#         for line in ET.iterparse(filename):
#             current = line[1].tag
#             counts[current] = counts.get(current, 0) + 1
    counts = defaultdict(int)
    for line in ET.iterparse(filename):
        current = line[1].tag
        counts[current] += 1
    return counts



def test():

    tags = count_tags('data/example.osm')
    pprint.pprint(tags)
    assert tags == {'bounds': 1,
                     'member': 3,
                     'nd': 4,
                     'node': 20,
                     'osm': 1,
                     'relation': 1,
                     'tag': 7,
                     'way': 1}

    

if __name__ == "__main__":
    test()

defaultdict(<type 'int'>, {'node': 20, 'nd': 4, 'bounds': 1, 'member': 3, 'tag': 7, 'relation': 1, 'way': 1, 'osm': 1})

Tag Types

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into MongoDB, you should
check the "k" value for each "<tag>" and see if they can be valid keys in MongoDB,
as well as see if there are any other potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data model
and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with problematic characters.
Please complete the function 'key_type'.
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        if lower.search(k_value) is not None:
            keys['lower'] += 1
        elif lower_colon.search(k_value) is not None:
            keys['lower_colon'] += 1
        elif problemchars.search(k_value) is not None:
            keys["problemchars"] += 1
        else:
            keys['other'] += 1

    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys



def test():
    # You can use another testfile 'map.osm' to look at your solution
    # Note that the assertions will be incorrect then.
    keys = process_map('data/example.osm')
    pprint.pprint(keys)
    assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}


if __name__ == "__main__":
    test()

{'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}

Exploring Users

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    return


def process_map(filename):
    users = set()
    for not_used, element in ET.iterparse(filename):
        #print "TAG:", element.tag
        #pprint.pprint(element.attrib)
        if element.tag == "node" or element.tag == "way" or element.tag == "relation":
            users.add(element.attrib['uid'])
            #pprint.pprint(element.attrib['uid'])

    return users


def test():

    users = process_map('data/example.osm')
    pprint.pprint(users)
    assert len(users) == 6



if __name__ == "__main__":
    test()

set(['1219059', '147510', '26299', '451048', '567034', '939355'])

Improving Street Names

"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "data/example.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road",
            "W.": "West",
            "N.": "North",
            "S.": "South",
            "E": "East"}


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    after = []
    # Split name string to test each part of the name;
    # Replacements may come anywhere in the name.
    for part in name.split(" "):
        # Check each part of the name against the keys in the correction dict
        if part in mapping.keys():
            # If exists in dict, overwrite that part of the name with the dict value for it.
            part = mapping[part]
        # Assemble each corrected piece of the name back together.
        after.append(part)
    # Return all pieces of the name as a string joined by a space.
    return " ".join(after)
    

#     for w in mapping.keys():
#         if w in name:
#             if flag:
#                 continue
#             # Replace abbrev. name in string with full name value from the mapping dict.
#             name = name.replace(w, mapping[w], 1)
#             # If St., flag to not check again in this string looking for St since new 'Street' will contain St
#             # re.compile() might be better
#             if w == "St.":
#                 flag = True

    return name


def test():
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name
            if name == "West Lexington St.":
                assert better_name == "West Lexington Street"
            if name == "Baldwin Rd.":
                assert better_name == "Baldwin Road"


if __name__ == '__main__':
    test()

{'Ave': set(['N. Lincoln Ave', 'North Lincoln Ave']),
 'Rd.': set(['Baldwin Rd.']),
 'St.': set(['West Lexington St.'])}
N. Lincoln Ave => North Lincoln Avenue
North Lincoln Ave => North Lincoln Avenue
West Lexington St. => West Lexington Street
Baldwin Rd. => Baldwin Road

Preparing for database

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. You could also do some cleaning
before doing that, like in the previous exercise, but for this exercise you just have to
shape the structure.

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_ref": ["305896090", "1719825889"]
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def is_address(elem):
    if elem.attrib['k'][:5] == "addr:":
        return True




def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way":
        address_info = {}
        nd_info = []
        #pprint.pprint(element.attrib)
        node["type"] = element.tag
        node["id"] = element.attrib["id"]
        if "visible" in element.attrib.keys():
            node["visible"] = element.attrib["visible"]
        if "lat" in element.attrib.keys():
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        node["created"] = {"version": element.attrib['version'],
                            "changeset": element.attrib['changeset'],
                            "timestamp": element.attrib['timestamp'],
                            "uid": element.attrib['uid'],
                            "user": element.attrib['user']}
        for tag in element.iter("tag"):
            #print tag.attrib
            p = problemchars.search(tag.attrib['k'])
            if p:
                #print "PROBLEM:", p.group()
                continue
            elif is_address(tag):
                if ":" in tag.attrib['k'][5:]:
                    #print "Bad Address:", tag.attrib['k'], "--", tag.attrib['v']
                    continue
                else:
                    address_info[tag.attrib['k'][5:]] = tag.attrib['v']
                    #print "Good Address:", tag.attrib['k'], "--", tag.attrib['v']
            else:
                node[tag.attrib['k']] = tag.attrib['v']
                #print "Outside:", tag.attrib['k'], "--", tag.attrib['v']
        if address_info != {}:
            node['address'] = address_info
        for tag2 in element.iter("nd"):
            nd_info.append(tag2.attrib['ref'])
            #print tag2.attrib['ref']
        if nd_info != []:
            node['node_refs'] = nd_info
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

def test():

    data = process_map('data/example.osm', False)
    #pprint.pprint(data)
    assert data[0] == {
                        "id": "261114295", 
                        "visible": "true", 
                        "type": "node", 
                        "pos": [
                          41.9730791, 
                          -87.6866303
                        ], 
                        "created": {
                          "changeset": "11129782", 
                          "user": "bbmiller", 
                          "version": "7", 
                          "uid": "451048", 
                          "timestamp": "2012-03-28T18:31:23Z"
                        }
                      }
    assert data[-1]["address"] == {
                                    "street": "West Lexington St.", 
                                    "housenumber": "1412"
                                      }
    assert data[-1]["node_refs"] == [ "2199822281", "2199822390",  "2199822392", "2199822369", 
                                    "2199822370", "2199822284", "2199822281"]
    print "Passed."

if __name__ == "__main__":
    test()

Passed.