In [7]:
import os
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections
import bson
import pymongo

In [32]:
# Data used - Weekly OSM Metro Extracts from:
# https://mapzen.com/metro-extracts/
# https://s3.amazonaws.com/metro-extracts.mapzen.com/albuquerque_new-mexico.osm.bz2
# https://s3.amazonaws.com/metro-extracts.mapzen.com/honolulu_hawaii.osm.bz2

DATADIR = "data"
DATAFILE = "honolulu_hawaii.osm"
DATAFILE2 = "albuquerque_new-mexico.osm"

HNL_DATA = os.path.join(DATADIR, DATAFILE)
ABQ_DATA = os.path.join(DATADIR, DATAFILE2)
In [3]:
def count_tags(filename):
    counts = collections.defaultdict(int)
    for _, elem in ET.iterparse(filename, events=("start",)):
        counts[elem.tag] += 1
    return counts

hnl_tags = count_tags(HNL_DATA)
abq_tags = count_tags(ABQ_DATA)
pprint.pprint(hnl_tags)
pprint.pprint(abq_tags)
defaultdict(<type 'int'>, {'node': 206288, 'nd': 245614, 'bounds': 1, 'member': 1160, 'tag': 104987, 'osm': 1, 'way': 21464, 'relation': 268})
defaultdict(<type 'int'>, {'node': 226327, 'nd': 283147, 'bounds': 1, 'member': 1398, 'tag': 208869, 'osm': 1, 'way': 35195, 'relation': 117})


In [4]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        if lower.search(k_value) is not None:
            keys['lower'] += 1
        elif lower_colon.search(k_value) is not None:
            keys['lower_colon'] += 1
        elif problemchars.search(k_value) is not None:
            keys["problemchars"] += 1
        else:
            keys['other'] += 1

    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

hnl_all_keys = process_map(HNL_DATA)
abq_all_keys = process_map(ABQ_DATA)
print hnl_all_keys
print abq_all_keys
{'problemchars': 0, 'lower': 52301, 'other': 2350, 'lower_colon': 50336}
{'problemchars': 7, 'lower': 90927, 'other': 2224, 'lower_colon': 115711}


This shows that many keys contain a colon, such as 'addr:postcode'.
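
As a quick spot check, a small helper along these lines can print a handful of those colon-separated keys from the Honolulu extract. This is a sketch and was not part of the original run; the function name is ours, while ET and HNL_DATA come from the cells above.

In []:
# Sketch: collect a few distinct 'k' values that contain a colon.
def sample_colon_keys(filename, limit=5):
    found = set()
    for _, element in ET.iterparse(filename):
        if element.tag == "tag" and ":" in element.attrib['k']:
            found.add(element.attrib['k'])
            if len(found) >= limit:
                break
    return found

print sample_colon_keys(HNL_DATA)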

In [5]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# Mapping of abbreviations and directionals to their expanded forms,
# updated iteratively while auditing the street names.
mapping = { "pl": "Place",
            "st": "Street",
            "ave": "Avenue",
            "rd": "Road",
            "w": "West",
            "n": "North",
            "s": "South",
            "e": "East",
            "blvd":"Boulevard",
            "sr": "Drive",
            "ct": "Court",
            "ne": "Northeast",
            "se": "Southeast",
            "nw": "Northwest",
            "sw": "Southwest",
            "dr": "Drive",
            "sq": "Square",
            "ln": "Lane",
            "trl": "Trail",
            "pkwy": "Parkway",
            "ste": "Suite",
            "lp": "Loop",
            "hwy": "Highway"}


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = collections.defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    after = []
    # Split name string to test each part of the name;
    # Replacements may come anywhere in the name.
    for part in name.split(" "):
        # Normalize each address part to check against the normalized keys in the dict.
        part = part.strip(",.").lower()
        # Check each part of the name against the keys in the correction dict        
        if part in mapping.keys():
            # If exists in dict, overwrite that part of the name with the dict value for it.
            part = mapping[part]
        # Assemble each corrected piece of the name back together.
        # Also, capitalize each address part before adding it back.
        # .capitalize() instead of .title() so 1st stays as 1st instead of converting to 1St 
        after.append(part.capitalize())
    # Return all pieces of the name as a string joined by a space.
    return " ".join(after)

hnl_st_types = audit(HNL_DATA)
abq_st_types = audit(ABQ_DATA)

In [7]:
# Mapping variables were updated: see full list above
for st_type, ways in hnl_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        # Show all street names in way nodes.
        print name, "=>", better_name
 king => King
Enchanted Lakes Shopping Center => Enchanted Lakes Shopping Center
Longs Drugs => Longs Drugs
Lusitania St. => Lusitania Street
Ala Pumalu St => Ala Pumalu Street
Lusitania St => Lusitania Street
Fort Street Mall => Fort Street Mall
McCarthy Mall => Mccarthy Mall
Pualani Way => Pualani Way
Wai Nani Way => Wai Nani Way
Kuaaina Way => Kuaaina Way
Ainakea Way => Ainakea Way
Coelho Way => Coelho Way
Kalakaua Ave => Kalakaua Avenue
Pualei Circle => Pualei Circle
Beach Walk => Beach Walk
Pali Highway => Pali Highway
Kamehameha Highway => Kamehameha Highway
Kalanianaole Highway => Kalanianaole Highway
Kalaniana'ole Highway => Kalaniana'ole Highway
Farrington Highway => Farrington Highway

In [8]:
# Mapping variables were updated: see full list above
for st_type, ways in abq_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        # Only show changed street names in way nodes 
        # since there are a lot more in this city data
        if name != better_name:
            print name, "=>", better_name
# One street would still need a lot of cleaning: 1833 8th Street Northwestalbuquerque Nm 87102
# But almost all of the names have been cleaned well.
indian School Road Northeast => Indian School Road Northeast
Constitution Ave Northeast => Constitution Avenue Northeast
Arvada Ave Northeast => Arvada Avenue Northeast
Campo del Sol Avenue Northeast => Campo Del Sol Avenue Northeast
Campo del Oso Avenue Northeast => Campo Del Oso Avenue Northeast
Gold Between 1st and 2nd => Gold Between 1st And 2nd
10th Street, Northwest => 10th Street Northwest
Central avenue => Central Avenue
Mullhacen Pl => Mullhacen Place
Cental AvenueSW => Cental Avenuesw
1833 8th Street NorthwestAlbuquerque, NM 87102 => 1833 8th Street Northwestalbuquerque Nm 87102
Balloon Museum Drive NE => Balloon Museum Drive Northeast
Paseo Alameda NE => Paseo Alameda Northeast
Uptown Loop Rd NE => Uptown Loop Road Northeast
LOMAS BLVD NE => Lomas Boulevard Northeast
Central Avenue NE => Central Avenue Northeast
Eubank Blvd NE => Eubank Boulevard Northeast
Glendale Ave NE => Glendale Avenue Northeast
Central Ave NE => Central Avenue Northeast
Renaissance Boulevard NE => Renaissance Boulevard Northeast
Eubank NE => Eubank Northeast
8700 Central Ave => 8700 Central Avenue
Lead and Morningside => Lead And Morningside
Valley View Dr NW => Valley View Drive Northwest
3301 Menaul Blvd. NE Suite A => 3301 Menaul Boulevard Northeast Suite A
Juan Tabo NE, Suite A => Juan Tabo Northeast Suite A
8th and Central => 8th And Central
Corner of Uptown Blvd and Uptown Loop => Corner Of Uptown Boulevard And Uptown Loop
Menaul Blvd. NE. => Menaul Boulevard Northeast
Bridge Boulevard SW => Bridge Boulevard Southwest
Eubank Northeast Ste E-18 => Eubank Northeast Suite E-18
Clark Ave SE => Clark Avenue Southeast
Randolph Ave SE => Randolph Avenue Southeast
3400 Crest Ave. SE => 3400 Crest Avenue Southeast
Girard SE => Girard Southeast
16th Street SouthWest => 16th Street Southwest


In [9]:
# Covers cases encountered in cleaning ('96826', 'HI 96819', '96734-9998')

def check_5_digits(new_postal):
    # Make sure postal code is 5 digits
    if len(str(new_postal)) == 5:
        return new_postal
    # Else return postal code with descriptive error message attached.
    else:
        return "NOT 5 DIGITS:"+str(new_postal)     

def correct_postal_codes(postal):
    # Try to convert numbers as intended. 
    # No need to check for conditions with if statements if the vast majority will be valid.
    try:
        new_postal = int(postal)
        # Check for 5 digits with another function.
        return check_5_digits(new_postal)
    except ValueError as value_error:
        # Check if it's a string with a hyphen containing 4 trailing digits e.g. '96734-9998'
        if '-' in postal:
            # Grab first digits and convert to int.
            postal = int(postal.split('-')[0])
            # Check for 5 digits with another function.
            return check_5_digits(postal)
        if ' ' in postal:
            postals = postal.split(' ')
            new_postals = [p for p in postals if len(p)==5]
            try:
                # Check the first entry for a valid 5 digit number
                new_postals = int(new_postals[0])
                return check_5_digits(new_postals)
            # Otherwise return the error thrown and values.
            except Exception as err:
                return err, new_postals
        else:
            # Return the ValueError thrown and value if it doesn't meet these criteria
            return "ValueError: " + str(postal)
    

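A quick sanity check of the cleaning function on the cases noted above. This cell is a sketch and was not part of the original run; the expected values in the comments follow from the function definitions above.

In []:
# Spot-check the postcode cleaner against the cases listed above.
print correct_postal_codes('96826')       # 96826
print correct_postal_codes('HI 96819')    # 96819
print correct_postal_codes('96734-9998')  # 96734
print correct_postal_codes('1234')        # NOT 5 DIGITS:1234
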
In [11]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def is_address(elem):
    return elem.attrib['k'].startswith("addr:")




def shape_element(element):
    # Make an empty dictionary for the output node/element.
    node = {}
    # Process 'node' or 'way' elements only.
    if element.tag == "node" or element.tag == "way":
        #if element.attrib["id"] == '21442033':
        #    print element.tag
        #    for tag in element.iter("tag"):
        #        print tag.attrib
        address_info = {}
        nd_info = []
        # Add 'node'/'way' as 'type' 
        node["type"] = element.tag
        # Add 'id' as 'id'
        node["id"] = element.attrib["id"]
        # If visible exists, add it to dict
        if "visible" in element.attrib.keys():
            node["visible"] = element.attrib["visible"]
        # Add 'lat'/'lon' as 'pos' if they are available
        if "lat" in element.attrib.keys():
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        # Add version, changeset, timestamp, uid, and user under the root node 'created'
        node["created"] = {"version": element.attrib['version'],
                            "changeset": element.attrib['changeset'],
                            "timestamp": element.attrib['timestamp'],
                            "uid": element.attrib['uid'],
                            "user": element.attrib['user']}
        # Iterate through the tags of k,v pairs.
        for tag in element.iter("tag"):
            #print tag.attrib
            p = problemchars.search(tag.attrib['k'])
            if p:
                # print "PROBLEM:", p.group()
                # Do nothing currently
                continue
            elif is_address(tag):
                if ":" in tag.attrib['k'][5:]:
                    # print "Bad Address:", tag.attrib['k'], "--", tag.attrib['v']
                    # There is a second ':' after the 'addr:' prefix
                    # (e.g. 'addr:street:name'); this script only handles
                    # single-level address keys, so skip it.
                    continue
                else:
                    # Otherwise add the part after the 'addr:' prefix as the key
                    # and the value from 'v' as the value in our address_info dict,
                    # e.g. 'addr:state' adds the key 'state'.
                    #
                    # Check the postcodes and correct them if applicable before adding into address_info
                    if tag.attrib['k'][5:] == 'postcode':
                        tag.attrib['v'] = correct_postal_codes(tag.attrib['v'])
                    address_info[tag.attrib['k'][5:]] = tag.attrib['v']
                    #print "Good Address:", tag.attrib['k'], "--", tag.attrib['v']
            else:
                # If there's no ':', just add the 'k' as a key, and 'v' as a value in our node dict.
                node[tag.attrib['k']] = tag.attrib['v']
                #print "Outside:", tag.attrib['k'], "--", tag.attrib['v']
        # If we found 'addr:' info and added it to our address_info dict,
        if address_info != {}:
            # Then add that address_info dict under the node 'address'
            node['address'] = address_info
        # Iterate through the 'nd' nodes if they exist.
        for tag2 in element.iter("nd"):
            # add each entry in a running list.
            nd_info.append(tag2.attrib['ref'])
        # If the resulting list isn't empty,
        if nd_info != []:
            # Add the list under the node 'node_refs'
            node['node_refs'] = nd_info
        return node
    else:
        # If the element isn't 'node' or 'way', just return None.
        return None


def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In []:
data = process_map("data/honolulu_hawaii.osm", False)
In [10]:
data = process_map("data/albuquerque_new-mexico.osm", False)
In [2]:
## Functions to load our database and collection in pymongo

from pymongo import MongoClient

# Function to return a database of the name specified.
# We want a database named 'project' in this case.
def get_db(db_name):
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    return db

## Function to return the collection we want to use in MongoDB
def get_collection(db, collection):
    collections_db = db[collection]
    return collections_db
In [50]:
## Function to insert json data into MongoDB
def insert_data(json_data, db_collection):
    with open(json_data, 'r') as f:
        ## json.loads() takes a string, while json.load() takes a file-like object.
        ## http://stackoverflow.com/questions/11568246/
        ## /loading-several-text-files-into-mongodb-using-pymongo
        for each_line in f.readlines():
            db_collection.insert(json.loads(each_line))
    print("Complete.")
In [3]:
def map_aggregate(db, collection, pipeline):
    db_collection = db[collection]
    result = db_collection.aggregate(pipeline)
    return result

Which users have contributed the most edits?

How do postcodes look?

We can initially see that some cleanup is needed for the zip codes. This includes:

1. Removing the four-digit postcode suffix.
2. Removing state letters from the postcode.
3. Converting the value to an integer.

After adding the postcode cleaning function, all zip codes adhere to a five-digit format, but a few (such as 89197 and 87100) that appear only once may have been entered incorrectly.
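
A helper along these lines can list postcodes by how often they appear, so the single-entry values stand out. This is a sketch (the function name is ours); it assumes a collection of the shaped documents, such as the combined 'cities' collection built below.

In []:
# Sketch: group documents by postcode and sort by ascending count,
# so postcodes that appear only once show up first.
def rare_postcodes(collection, limit=10):
    pipeline = [{"$match": {"address.postcode": {"$exists": 1}}},
                {"$group": {"_id": "$address.postcode",
                            "count": {"$sum": 1}}},
                {"$sort": {"count": 1}},
                {"$limit": limit}]
    return collection.aggregate(pipeline)

# Example (after the 'cities' collection is loaded below):
# pprint.pprint(rare_postcodes(db.cities))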


Combining cities into one collection and restructuring queries

We can add the city name to each node for aggregation in a combined database.

In [117]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def is_address(elem):
    return elem.attrib['k'].startswith("addr:")

def shape_element(element, city_name):
    '''
    Function to pull information for way/node entries from .osm files and 
    store as json records for loading into MongoDB.
    
    Returns data record as a dictionary.
    
    Example structure:
    
    {city_name: value,
     type: value,
     id: value,
     visible: value,
     address: {street: value,
               postcode: value,
               housenumber: value,
               housename: value},
     created: {version: value,
               changeset: value,
               timestamp: value,
               uid: value,
               user: value},
     pos: [lat, lon],
     node_refs: value}
    
    '''
    # Make an empty dictionary for the output node/element.
    node = {}
    # Process 'node' or 'way' elements only.
    if element.tag == "node" or element.tag == "way":
        ## Add in the city name to each node before writing
        ## out the json file.
        node["city_name"] = city_name
        # Create a sub-dictionary for the address info.
        address_info = {}
        nd_info = []
        # Add 'node'/'way' as 'type' 
        node["type"] = element.tag
        # Add 'id' as 'id'
        node["id"] = element.attrib["id"]
        # If visible exists, add it to dict
        if "visible" in element.attrib.keys():
            node["visible"] = element.attrib["visible"]
        # Add 'lat'/'lon' as 'pos' if they are available
        if "lat" in element.attrib.keys():
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        # Add version, changeset, timestamp, uid, and user under the root node 'created'
        node["created"] = {"version": element.attrib['version'],
                            "changeset": element.attrib['changeset'],
                            "timestamp": element.attrib['timestamp'],
                            "uid": element.attrib['uid'],
                            "user": element.attrib['user']}
        # Iterate through the tags of k,v pairs.
        for tag in element.iter("tag"):
            #print tag.attrib
            p = problemchars.search(tag.attrib['k'])
            if p:
                # print "PROBLEM:", p.group()
                # Do nothing currently
                continue
            elif is_address(tag):
                if ":" in tag.attrib['k'][5:]:
                    # print "Bad Address:", tag.attrib['k'], "--", tag.attrib['v']
                    # There is a second ':' after the 'addr:' prefix
                    # (e.g. 'addr:street:name'); this script only handles
                    # single-level address keys, so skip it.
                    continue
                else:
                    # Otherwise add the part after the 'addr:' prefix as the key
                    # and the value from 'v' as the value in our address_info dict,
                    # e.g. 'addr:state' adds the key 'state'.
                    #
                    # Check the postcodes and correct them if applicable before adding into address_info
                    if tag.attrib['k'][5:] == 'postcode':
                        tag.attrib['v'] = correct_postal_codes(tag.attrib['v'])
                    address_info[tag.attrib['k'][5:]] = tag.attrib['v']
                    #print "Good Address:", tag.attrib['k'], "--", tag.attrib['v']
            else:
                # If there's no ':', just add the 'k' as a key, and 'v' as a value in our node dict.
                node[tag.attrib['k']] = tag.attrib['v']
                #print "Outside:", tag.attrib['k'], "--", tag.attrib['v']
        # If we found 'addr:' info and added it to our address_info dict,
        if address_info != {}:
            # Then add that address_info dict under the node 'address'
            node['address'] = address_info
        # Iterate through the 'nd' nodes if they exist.
        for tag2 in element.iter("nd"):
            # add each entry in a running list.
            nd_info.append(tag2.attrib['ref'])
        # If the resulting list isn't empty,
        if nd_info != []:
            # Add the list under the node 'node_refs'
            node['node_refs'] = nd_info
        return node
    else:
        # If the element isn't 'node' or 'way', just return None.
        return None


def process_map(file_in, city_name, pretty = False):
    '''
    Shape each node/way element from the .osm file (tagging it with the
    given city name) and write the resulting dictionaries out to a .json
    file for loading into MongoDB.
    
    Returns the list of shaped dictionaries.
    '''
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element, city_name)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [122]:
data = process_map("data/honolulu_hawaii.osm", 'honolulu', False)
In [123]:
data = process_map("data/albuquerque_new-mexico.osm", 'albuquerque', False)
In [4]:
# Get 'project' database
db = get_db('project')

# Get 'cities' collection in the 'project' database
# Put honolulu and albuquerque city data in this collection.
db_cities = get_collection(db, 'cities')
In [125]:
hnl_json_data = 'data/honolulu_hawaii.osm.json'  
insert_data(hnl_json_data, db_cities)
Complete.

In [126]:
abq_json_data = 'data/albuquerque_new-mexico.osm.json' 
insert_data(abq_json_data, db_cities)
Complete.

In [5]:
db.cities
Out[5]:
Collection(Database(MongoClient('localhost', 27017), u'project'), u'cities')

We can either look up the top user contributors overall grouped by city...

In [8]:
result = db.cities.aggregate([{"$match":{"created.user":{"$exists":1}}},
                {"$group": 
                 {"_id": {"City":"$city_name",
                          "User":"$created.user"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "User":"$_id.User",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 7 }])
pprint.pprint(result)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 98401, u'User': u'Tom_Holland'},
             {u'City': u'albuquerque',
              u'Count': 88041,
              u'User': u'EdHillsman'},
             {u'City': u'albuquerque', u'Count': 37604, u'User': u'anjbe'},
             {u'City': u'albuquerque', u'Count': 32985, u'User': u'jackbus'},
             {u'City': u'albuquerque',
              u'Count': 31170,
              u'User': u'woodpeck_fixbot'},
             {u'City': u'honolulu', u'Count': 13051, u'User': u'ikiya'},
             {u'City': u'honolulu',
              u'Count': 11205,
              u'User': u'woodpeck_fixbot'}]}

Or return the top user contributors for each city individually.

That is what we will do below, by making a pipeline function that switches between cities.

In [26]:
def make_city_pipeline(city):
    pipeline = [{"$match":{"created.user":{"$exists":1},
                                          "city_name":city}},
                 {"$group": {"_id": {"City":"$city_name",
                                     "User":"$created.user"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "User":"$_id.User",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 5 }]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 98401, u'User': u'Tom_Holland'},
             {u'City': u'honolulu', u'Count': 13051, u'User': u'ikiya'},
             {u'City': u'honolulu',
              u'Count': 11205,
              u'User': u'woodpeck_fixbot'},
             {u'City': u'honolulu', u'Count': 10651, u'User': u'pdunn'},
             {u'City': u'honolulu',
              u'Count': 10251,
              u'User': u'Chris Lawrence'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque',
              u'Count': 88041,
              u'User': u'EdHillsman'},
             {u'City': u'albuquerque', u'Count': 37604, u'User': u'anjbe'},
             {u'City': u'albuquerque', u'Count': 32985, u'User': u'jackbus'},
             {u'City': u'albuquerque',
              u'Count': 31170,
              u'User': u'woodpeck_fixbot'},
             {u'City': u'albuquerque',
              u'Count': 10406,
              u'User': u'greenv505'}]}


In [24]:
## The Honolulu area and Albuquerque are represented in this dataset.
## Some other areas across Oahu are also sparsely represented.
def make_city_pipeline(city):
    pipeline = [{"$match":{"address.city":{"$exists":1},
                           "city_name":city}},
                       {"$group":{"_id":{"City":"$city_name",
                                         "City_name":"$address.city"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "Name":"$_id.City_name",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":10}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'Count': 429, u'Name': u'Honolulu'},
             {u'Count': 6, u'Name': u'Kailua'},
             {u'Count': 5, u'Name': u'honolulu'},
             {u'Count': 3, u'Name': u'Honolulu, HI'},
             {u'Count': 3, u'Name': u'Honlulu'},
             {u'Count': 3, u'Name': u'Haleiwa'},
             {u'Count': 1, u'Name': u'Hauula'},
             {u'Count': 1, u'Name': u'Aiea'},
             {u'Count': 1, u'Name': u'honolulu, HI'},
             {u'Count': 1, u'Name': u'Mililani'}]}

{u'ok': 1.0,
 u'result': [{u'Count': 1445, u'Name': u'Albuquerque'},
             {u'Count': 2, u'Name': u'albuquerque'},
             {u'Count': 2, u'Name': u'Albuquerque, NM'},
             {u'Count': 1, u'Name': u'Kirtland Air Force Base'}]}


In [13]:
# Parking is the highest reported amenity by far in both cities.
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                          "city_name":city}},
                 {"$group": {"_id": {"City":"$city_name",
                                    "Amenity":"$amenity"},
                            "count": {"$sum": 1}}},
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "Amenity":"$_id.Amenity",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 5 }]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking', u'City': u'honolulu', u'Count': 280},
             {u'Amenity': u'restaurant', u'City': u'honolulu', u'Count': 123},
             {u'Amenity': u'school', u'City': u'honolulu', u'Count': 79},
             {u'Amenity': u'fast_food', u'City': u'honolulu', u'Count': 43},
             {u'Amenity': u'cafe', u'City': u'honolulu', u'Count': 37}]}

{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking',
              u'City': u'albuquerque',
              u'Count': 1270},
             {u'Amenity': u'school', u'City': u'albuquerque', u'Count': 258},
             {u'Amenity': u'place_of_worship',
              u'City': u'albuquerque',
              u'Count': 227},
             {u'Amenity': u'restaurant',
              u'City': u'albuquerque',
              u'Count': 221},
             {u'Amenity': u'bicycle_parking',
              u'City': u'albuquerque',
              u'Count': 221}]}


In [7]:
def make_city_pipeline(city):
    pipeline = [{"$match":{"address.postcode":{"$exists":1},
                                          "city_name":city}},   
                 {"$group": {"_id": {"City":"$city_name",
                                     "Zip":"$address.postcode"},
                            "count": {"$sum": 1}}},
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "Zipcode":"$_id.Zip",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 1 }]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 219, u'Zipcode': 96815}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 476, u'Zipcode': 87123}]}


In [13]:
# Total records in the database.
print "Both Cities:", db.cities.find().count() 
print "Honolulu:", db.cities.find({'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({'city_name':'albuquerque'}).count()
Both Cities: 489274
Honolulu: 227752
Albuquerque: 261522

In [15]:
# Number of 'node' documents.
print "Both Cities:", db.cities.find({"type":"node"}).count()
print "Honolulu:", db.cities.find({"type":"node",
                                    'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({"type":"node",
                                    'city_name':'albuquerque'}).count()
Both Cities: 432587
Honolulu: 206262
Albuquerque: 226325

In [16]:
# Number of 'way' documents.
print "Both Cities:", db.cities.find({'type':'way'}).count()
print "Honolulu:", db.cities.find({'type':'way',
                                  'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({'type':'way',
                                  'city_name':'albuquerque'}).count()
Both Cities: 56648
Honolulu: 21459
Albuquerque: 35189

In [17]:
# Number of contributors.
print "Contributors:", len(db.cities.distinct("created.user"))
Contributors: 611


In [85]:
## Find the most popular amenities
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                          "city_name":city}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Amenity":"$amenity"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Amenity":"$_id.Amenity",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":10}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking', u'City': u'honolulu', u'Count': 280},
             {u'Amenity': u'restaurant', u'City': u'honolulu', u'Count': 123},
             {u'Amenity': u'school', u'City': u'honolulu', u'Count': 79},
             {u'Amenity': u'fast_food', u'City': u'honolulu', u'Count': 43},
             {u'Amenity': u'cafe', u'City': u'honolulu', u'Count': 37},
             {u'Amenity': u'fire_station',
              u'City': u'honolulu',
              u'Count': 36},
             {u'Amenity': u'library', u'City': u'honolulu', u'Count': 31},
             {u'Amenity': u'waste_basket',
              u'City': u'honolulu',
              u'Count': 28},
             {u'Amenity': u'toilets', u'City': u'honolulu', u'Count': 26},
             {u'Amenity': u'place_of_worship',
              u'City': u'honolulu',
              u'Count': 23}]}

{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking',
              u'City': u'albuquerque',
              u'Count': 1270},
             {u'Amenity': u'school', u'City': u'albuquerque', u'Count': 258},
             {u'Amenity': u'place_of_worship',
              u'City': u'albuquerque',
              u'Count': 227},
             {u'Amenity': u'restaurant',
              u'City': u'albuquerque',
              u'Count': 221},
             {u'Amenity': u'bicycle_parking',
              u'City': u'albuquerque',
              u'Count': 221},
             {u'Amenity': u'fast_food',
              u'City': u'albuquerque',
              u'Count': 171},
             {u'Amenity': u'fuel', u'City': u'albuquerque', u'Count': 113},
             {u'Amenity': u'cafe', u'City': u'albuquerque', u'Count': 56},
             {u'Amenity': u'post_box', u'City': u'albuquerque', u'Count': 53},
             {u'Amenity': u'bank', u'City': u'albuquerque', u'Count': 46}]}


In [86]:
## Find the most popular places of worship
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                  "amenity":"place_of_worship",
                                  "city_name":city}},
                       {"$group":{"_id": {"City":"$city_name",
                                          "Religion":"$religion"},
                                  "count":{"$sum":1}}},
                       {"$project":{"_id":0,
                                    "City":"$_id.City",
                                    "Religion":"$_id.Religion",
                                    "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 14, u'Religion': u'christian'},
             {u'City': u'honolulu', u'Count': 5, u'Religion': u'buddhist'},
             {u'City': u'honolulu', u'Count': 3},
             {u'City': u'honolulu', u'Count': 1, u'Religion': u'muslim'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque',
              u'Count': 186,
              u'Religion': u'christian'},
             {u'City': u'albuquerque', u'Count': 36},
             {u'City': u'albuquerque', u'Count': 3, u'Religion': u'jewish'},
             {u'City': u'albuquerque', u'Count': 1, u'Religion': u'buddhist'},
             {u'City': u'albuquerque', u'Count': 1, u'Religion': u'sikh'}]}


In [30]:
## Find the most popular restaurants
## Restaurants with no cuisine value are left in to show how many 
## were reported without a cuisine filled in.
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                 "amenity":"restaurant",
                                 "city_name":city}},
                      {"$group":{"_id":{"City":"$city_name",
                                        "Food":"$cuisine"},
                                 "count":{"$sum":1}}},
                      {"$project":{"_id":0,
                                  "City":"$_id.City",
                                  "Food":"$_id.Food",
                                  "Count":"$count"}},
                      {"$sort":{"Count":-1}}, 
                      {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 87},
             {u'City': u'honolulu', u'Count': 7, u'Food': u'pizza'},
             {u'City': u'honolulu', u'Count': 3, u'Food': u'regional'},
             {u'City': u'honolulu', u'Count': 3, u'Food': u'japanese'},
             {u'City': u'honolulu', u'Count': 3, u'Food': u'chinese'},
             {u'City': u'honolulu', u'Count': 2, u'Food': u'burger'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 134},
             {u'City': u'albuquerque', u'Count': 16, u'Food': u'mexican'},
             {u'City': u'albuquerque', u'Count': 10, u'Food': u'pizza'},
             {u'City': u'albuquerque', u'Count': 6, u'Food': u'american'},
             {u'City': u'albuquerque', u'Count': 5, u'Food': u'regional'},
             {u'City': u'albuquerque', u'Count': 4, u'Food': u'burger'}]}


In [88]:
## Find the most popular fast food joints
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                 "amenity":"fast_food",
                                 "city_name":city}},
                      {"$group":{"_id":{"City":"$city_name",
                                        "Food":"$cuisine"},
                                 "count":{"$sum":1}}},
                      {"$project":{"_id":0,
                                  "City":"$_id.City",
                                  "Food":"$_id.Food",
                                  "Count":"$count"}},
                      {"$sort":{"Count":-1}}, 
                      {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 19},
             {u'City': u'honolulu', u'Count': 13, u'Food': u'burger'},
             {u'City': u'honolulu', u'Count': 2, u'Food': u'sandwich'},
             {u'City': u'honolulu', u'Count': 2, u'Food': u'sushi'},
             {u'City': u'honolulu', u'Count': 1, u'Food': u'thai'},
             {u'City': u'honolulu', u'Count': 1, u'Food': u'pizza'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 98},
             {u'City': u'albuquerque', u'Count': 31, u'Food': u'burger'},
             {u'City': u'albuquerque', u'Count': 16, u'Food': u'sandwich'},
             {u'City': u'albuquerque', u'Count': 6, u'Food': u'pizza'},
             {u'City': u'albuquerque', u'Count': 4, u'Food': u'mexican'},
             {u'City': u'albuquerque', u'Count': 3, u'Food': u'chicken'}]}


In [89]:
## Find the names of the most popular fast food joints
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                 "amenity":"fast_food",
                                 "city_name":city}},
                      {"$group":{"_id":{"City":"$city_name",
                                        "Name":"$name"},
                                 "count":{"$sum":1}}},
                      {"$project":{"_id":0,
                                  "City":"$_id.City",
                                  "Name":"$_id.Name",
                                  "Count":"$count"}},
                      {"$sort":{"Count":-1}}, 
                      {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 8, u'Name': u"McDonald's"},
             {u'City': u'honolulu', u'Count': 4, u'Name': u'Subway'},
             {u'City': u'honolulu', u'Count': 3, u'Name': u'Burger King'},
             {u'City': u'honolulu', u'Count': 2, u'Name': u'KFC'},
             {u'City': u'honolulu',
              u'Count': 2,
              u'Name': u'Loco Moco Drive-Inn'},
             {u'City': u'honolulu', u'Count': 2, u'Name': u'Taco Bell'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 23, u'Name': u'Subway'},
             {u'City': u'albuquerque',
              u'Count': 12,
              u'Name': u"Blake's Lotaburger"},
             {u'City': u'albuquerque', u'Count': 11, u'Name': u"McDonald's"},
             {u'City': u'albuquerque', u'Count': 9, u'Name': u"Wendy's"},
             {u'City': u'albuquerque', u'Count': 8, u'Name': u'Sonic'},
             {u'City': u'albuquerque', u'Count': 7, u'Name': u'Burger King'}]}


In [90]:
## Find the most popular shops.
def make_city_pipeline(city):
    pipeline = [{"$match":{"shop":{"$exists":1},
                                          "city_name":city}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Shop":"$shop"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Shop":"$_id.Shop",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":10}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 50, u'Shop': u'supermarket'},
             {u'City': u'honolulu', u'Count': 24, u'Shop': u'convenience'},
             {u'City': u'honolulu', u'Count': 18, u'Shop': u'clothes'},
             {u'City': u'honolulu', u'Count': 7, u'Shop': u'mall'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'hairdresser'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'bakery'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'hifi'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'jewelry'},
             {u'City': u'honolulu', u'Count': 3, u'Shop': u'sports'},
             {u'City': u'honolulu', u'Count': 3, u'Shop': u'garden_centre'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 66, u'Shop': u'convenience'},
             {u'City': u'albuquerque', u'Count': 46, u'Shop': u'supermarket'},
             {u'City': u'albuquerque', u'Count': 22, u'Shop': u'car_repair'},
             {u'City': u'albuquerque', u'Count': 18, u'Shop': u'clothes'},
             {u'City': u'albuquerque', u'Count': 15, u'Shop': u'hairdresser'},
             {u'City': u'albuquerque', u'Count': 14, u'Shop': u'car'},
             {u'City': u'albuquerque',
              u'Count': 12,
              u'Shop': u'department_store'},
             {u'City': u'albuquerque', u'Count': 12, u'Shop': u'furniture'},
             {u'City': u'albuquerque', u'Count': 12, u'Shop': u'bicycle'},
             {u'City': u'albuquerque', u'Count': 12, u'Shop': u'mall'}]}


In [93]:
## Find the names of the most popular supermarkets
def make_city_pipeline(city):
    pipeline = [{"$match":{"shop":{"$exists":1},
                           "city_name":city,
                           "shop":"supermarket"}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Supermarket":"$name"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Supermarket":"$_id.Supermarket",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":5}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 4, u'Supermarket': u'Foodland'},
             {u'City': u'honolulu', u'Count': 2, u'Supermarket': u'Ross'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Supermarket': u'Don Quijote'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Supermarket': u'Safeway 1263'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Supermarket': u'Malama Market'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque',
              u'Count': 6,
              u'Supermarket': u"Smith's"},
             {u'City': u'albuquerque', u'Count': 5},
             {u'City': u'albuquerque',
              u'Count': 4,
              u'Supermarket': u"Albertson's"},
             {u'City': u'albuquerque',
              u'Count': 2,
              u'Supermarket': u"Trader Joe's"},
             {u'City': u'albuquerque',
              u'Count': 2,
              u'Supermarket': u'Walmart'}]}


In [94]:
## Find the names of the most popular convenience stores
def make_city_pipeline(city):
    pipeline = [{"$match":{"shop":{"$exists":1},
                           "city_name":city,
                           "shop":"convenience"}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Name":"$name"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Name":"$_id.Name",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":5}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 7, u'Name': u'ABC Store'},
             {u'City': u'honolulu', u'Count': 2, u'Name': u'AT&T Store'},
             {u'City': u'honolulu', u'Count': 1, u'Name': u'7-Eleven'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Name': u'AT&T Store Kapahulu'},
             {u'City': u'honolulu', u'Count': 1, u'Name': u'7-ELEVEN    76'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 14, u'Name': u'Circle K'},
             {u'City': u'albuquerque', u'Count': 14},
             {u'City': u'albuquerque', u'Count': 12, u'Name': u'7-11'},
             {u'City': u'albuquerque', u'Count': 5, u'Name': u'Giant'},
             {u'City': u'albuquerque', u'Count': 2, u'Name': u"Bubba's"}]}