In [7]:
import os
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections
import bson
import pymongo

In [32]:
# Data used - Weekly OSM Metro Extracts from:
# https://mapzen.com/metro-extracts/
# https://s3.amazonaws.com/metro-extracts.mapzen.com/albuquerque_new-mexico.osm.bz2
# https://s3.amazonaws.com/metro-extracts.mapzen.com/honolulu_hawaii.osm.bz2

DATADIR = "data"
DATAFILE = "honolulu_hawaii.osm"
DATAFILE2 = "albuquerque_new-mexico.osm"

HNL_DATA = os.path.join(DATADIR, DATAFILE)
ABQ_DATA = os.path.join(DATADIR, DATAFILE2)
In [3]:
def count_tags(filename):
    counts = collections.defaultdict(int)
    for _, elem in ET.iterparse(filename, events=("start",)):
        counts[elem.tag] += 1
    return counts

hnl_tags = count_tags(HNL_DATA)
abq_tags = count_tags(ABQ_DATA)
pprint.pprint(hnl_tags)
pprint.pprint(abq_tags)
defaultdict(<type 'int'>, {'node': 206288, 'nd': 245614, 'bounds': 1, 'member': 1160, 'tag': 104987, 'osm': 1, 'way': 21464, 'relation': 268})
defaultdict(<type 'int'>, {'node': 226327, 'nd': 283147, 'bounds': 1, 'member': 1398, 'tag': 208869, 'osm': 1, 'way': 35195, 'relation': 117})


In [4]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        if lower.search(k_value) is not None:
            keys['lower'] += 1
        elif lower_colon.search(k_value) is not None:
            keys['lower_colon'] += 1
        elif problemchars.search(k_value) is not None:
            keys["problemchars"] += 1
        else:
            keys['other'] += 1

    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

hnl_all_keys = process_map(HNL_DATA)
abq_all_keys = process_map(ABQ_DATA)
print hnl_all_keys
print abq_all_keys
{'problemchars': 0, 'lower': 52301, 'other': 2350, 'lower_colon': 50336}
{'problemchars': 7, 'lower': 90927, 'other': 2224, 'lower_colon': 115711}


This shows that many keys contain a colon, such as 'addr:postcode'.
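
As a quick spot check, a small helper along these lines can print a handful of those colon-separated keys from the Honolulu extract. This is a sketch and was not part of the original run; the function name is ours, while ET and HNL_DATA come from the cells above.

In []:
# Sketch: collect a few distinct 'k' values that contain a colon.
def sample_colon_keys(filename, limit=5):
    found = set()
    for _, element in ET.iterparse(filename):
        if element.tag == "tag" and ":" in element.attrib['k']:
            found.add(element.attrib['k'])
            if len(found) >= limit:
                break
    return found

print sample_colon_keys(HNL_DATA)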

In [5]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# Mapping of abbreviations and directionals to their expanded forms,
# updated iteratively while auditing the street names.
mapping = { "pl": "Place",
            "st": "Street",
            "ave": "Avenue",
            "rd": "Road",
            "w": "West",
            "n": "North",
            "s": "South",
            "e": "East",
            "blvd":"Boulevard",
            "sr": "Drive",
            "ct": "Court",
            "ne": "Northeast",
            "se": "Southeast",
            "nw": "Northwest",
            "sw": "Southwest",
            "dr": "Drive",
            "sq": "Square",
            "ln": "Lane",
            "trl": "Trail",
            "pkwy": "Parkway",
            "ste": "Suite",
            "lp": "Loop",
            "hwy": "Highway"}


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = collections.defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    after = []
    # Split name string to test each part of the name;
    # Replacements may come anywhere in the name.
    for part in name.split(" "):
        # Normalize each address part to check against the normalized keys in the dict.
        part = part.strip(",.").lower()
        # Check each part of the name against the keys in the correction dict        
        if part in mapping.keys():
            # If exists in dict, overwrite that part of the name with the dict value for it.
            part = mapping[part]
        # Assemble each corrected piece of the name back together.
        # Also, capitalize each address part before adding it back.
        # .capitalize() instead of .title() so 1st stays as 1st instead of converting to 1St 
        after.append(part.capitalize())
    # Return all pieces of the name as a string joined by a space.
    return " ".join(after)

hnl_st_types = audit(HNL_DATA)
abq_st_types = audit(ABQ_DATA)

In [7]:
# Mapping variables were updated: see full list above
for st_type, ways in hnl_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        # Show all street names in way nodes.
        print name, "=>", better_name
 king => King
Enchanted Lakes Shopping Center => Enchanted Lakes Shopping Center
Longs Drugs => Longs Drugs
Lusitania St. => Lusitania Street
Ala Pumalu St => Ala Pumalu Street
Lusitania St => Lusitania Street
Fort Street Mall => Fort Street Mall
McCarthy Mall => Mccarthy Mall
Pualani Way => Pualani Way
Wai Nani Way => Wai Nani Way
Kuaaina Way => Kuaaina Way
Ainakea Way => Ainakea Way
Coelho Way => Coelho Way
Kalakaua Ave => Kalakaua Avenue
Pualei Circle => Pualei Circle
Beach Walk => Beach Walk
Pali Highway => Pali Highway
Kamehameha Highway => Kamehameha Highway
Kalanianaole Highway => Kalanianaole Highway
Kalaniana'ole Highway => Kalaniana'ole Highway
Farrington Highway => Farrington Highway

In [8]:
# Mapping variables were updated: see full list above
for st_type, ways in abq_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        # Only show changed street names in way nodes 
        # since there are a lot more in this city data
        if name != better_name:
            print name, "=>", better_name
# One street would still need a lot of cleaning: 1833 8th Street Northwestalbuquerque Nm 87102
# But almost all of the names have been cleaned well.
indian School Road Northeast => Indian School Road Northeast
Constitution Ave Northeast => Constitution Avenue Northeast
Arvada Ave Northeast => Arvada Avenue Northeast
Campo del Sol Avenue Northeast => Campo Del Sol Avenue Northeast
Campo del Oso Avenue Northeast => Campo Del Oso Avenue Northeast
Gold Between 1st and 2nd => Gold Between 1st And 2nd
10th Street, Northwest => 10th Street Northwest
Central avenue => Central Avenue
Mullhacen Pl => Mullhacen Place
Cental AvenueSW => Cental Avenuesw
1833 8th Street NorthwestAlbuquerque, NM 87102 => 1833 8th Street Northwestalbuquerque Nm 87102
Balloon Museum Drive NE => Balloon Museum Drive Northeast
Paseo Alameda NE => Paseo Alameda Northeast
Uptown Loop Rd NE => Uptown Loop Road Northeast
LOMAS BLVD NE => Lomas Boulevard Northeast
Central Avenue NE => Central Avenue Northeast
Eubank Blvd NE => Eubank Boulevard Northeast
Glendale Ave NE => Glendale Avenue Northeast
Central Ave NE => Central Avenue Northeast
Renaissance Boulevard NE => Renaissance Boulevard Northeast
Eubank NE => Eubank Northeast
8700 Central Ave => 8700 Central Avenue
Lead and Morningside => Lead And Morningside
Valley View Dr NW => Valley View Drive Northwest
3301 Menaul Blvd. NE Suite A => 3301 Menaul Boulevard Northeast Suite A
Juan Tabo NE, Suite A => Juan Tabo Northeast Suite A
8th and Central => 8th And Central
Corner of Uptown Blvd and Uptown Loop => Corner Of Uptown Boulevard And Uptown Loop
Menaul Blvd. NE. => Menaul Boulevard Northeast
Bridge Boulevard SW => Bridge Boulevard Southwest
Eubank Northeast Ste E-18 => Eubank Northeast Suite E-18
Clark Ave SE => Clark Avenue Southeast
Randolph Ave SE => Randolph Avenue Southeast
3400 Crest Ave. SE => 3400 Crest Avenue Southeast
Girard SE => Girard Southeast
16th Street SouthWest => 16th Street Southwest


In [9]:
# Covers cases encountered in cleaning ('96826', 'HI 96819', '96734-9998')

def check_5_digits(new_postal):
    # Make sure postal code is 5 digits
    if len(str(new_postal)) == 5:
        return new_postal
    # Else return postal code with descriptive error message attached.
    else:
        return "NOT 5 DIGITS:"+str(new_postal)     

def correct_postal_codes(postal):
    # Try to convert numbers as intended. 
    # No need to check for conditions with if statements if the vast majority will be valid.
    try:
        new_postal = int(postal)
        # Check for 5 digits with another function.
        return check_5_digits(new_postal)
    except ValueError as value_error:
        # Check if it's a string with a hyphen containing 4 trailing digits e.g. '96734-9998'
        if '-' in postal:
            # Grab first digits and convert to int.
            postal = int(postal.split('-')[0])
            # Check for 5 digits with another function.
            return check_5_digits(postal)
        if ' ' in postal:
            postals = postal.split(' ')
            new_postals = [p for p in postals if len(p)==5]
            try:
                # Check the first entry for a valid 5 digit number
                new_postals = int(new_postals[0])
                return check_5_digits(new_postals)
            # Otherwise return the error thrown and values.
            except Exception as err:
                return err, new_postals
        else:
            # Return the ValueError thrown and value if it doesn't meet these criteria
            return "ValueError: " + str(postal)
    

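A quick sanity check of the cleaning function on the cases noted above. This cell is a sketch and was not part of the original run; the expected values in the comments follow from the function definitions above.

In []:
# Spot-check the postcode cleaner against the cases listed above.
print correct_postal_codes('96826')       # 96826
print correct_postal_codes('HI 96819')    # 96819
print correct_postal_codes('96734-9998')  # 96734
print correct_postal_codes('1234')        # NOT 5 DIGITS:1234
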
In [11]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def is_address(elem):
    return elem.attrib['k'].startswith("addr:")




def shape_element(element):
    # Make an empty dictionary for the output node/element.
    node = {}
    # Process 'node' or 'way' elements only.
    if element.tag == "node" or element.tag == "way":
        #if element.attrib["id"] == '21442033':
        #    print element.tag
        #    for tag in element.iter("tag"):
        #        print tag.attrib
        address_info = {}
        nd_info = []
        # Add 'node'/'way' as 'type' 
        node["type"] = element.tag
        # Add 'id' as 'id'
        node["id"] = element.attrib["id"]
        # If visible exists, add it to dict
        if "visible" in element.attrib.keys():
            node["visible"] = element.attrib["visible"]
        # Add 'lat'/'lon' as 'pos' if they are available
        if "lat" in element.attrib.keys():
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        # Add version, changeset, timestamp, uid, and user under the root node 'created'
        node["created"] = {"version": element.attrib['version'],
                            "changeset": element.attrib['changeset'],
                            "timestamp": element.attrib['timestamp'],
                            "uid": element.attrib['uid'],
                            "user": element.attrib['user']}
        # Iterate through the tags of k,v pairs.
        for tag in element.iter("tag"):
            #print tag.attrib
            p = problemchars.search(tag.attrib['k'])
            if p:
                # print "PROBLEM:", p.group()
                # Do nothing currently
                continue
            elif is_address(tag):
                if ":" in tag.attrib['k'][5:]:
                    # print "Bad Address:", tag.attrib['k'], "--", tag.attrib['v']
                    # There is a second ':' after the 'addr:' prefix
                    # (e.g. 'addr:street:name'); this script only handles
                    # single-level address keys, so skip it.
                    continue
                else:
                    # Otherwise add the part after the 'addr:' prefix as the key
                    # and the value from 'v' as the value in our address_info dict,
                    # e.g. 'addr:state' adds the key 'state'.
                    #
                    # Check the postcodes and correct them if applicable before adding into address_info
                    if tag.attrib['k'][5:] == 'postcode':
                        tag.attrib['v'] = correct_postal_codes(tag.attrib['v'])
                    address_info[tag.attrib['k'][5:]] = tag.attrib['v']
                    #print "Good Address:", tag.attrib['k'], "--", tag.attrib['v']
            else:
                # If there's no ':', just add the 'k' as a key, and 'v' as a value in our node dict.
                node[tag.attrib['k']] = tag.attrib['v']
                #print "Outside:", tag.attrib['k'], "--", tag.attrib['v']
        # If we found 'addr:' info and added it to our address_info dict,
        if address_info != {}:
            # Then add that address_info dict under the node 'address'
            node['address'] = address_info
        # Iterate through the 'nd' nodes if they exist.
        for tag2 in element.iter("nd"):
            # add each entry in a running list.
            nd_info.append(tag2.attrib['ref'])
        # If the resulting list isn't empty,
        if nd_info != []:
            # Add the list under the node 'node_refs'
            node['node_refs'] = nd_info
        return node
    else:
        # If the element isn't 'node' or 'way', just return None.
        return None


def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In []:
data = process_map("data/honolulu_hawaii.osm", False)
In [10]:
data = process_map("data/albuquerque_new-mexico.osm", False)
In [2]:
## Functions to load our database and collection in pymongo

from pymongo import MongoClient

# Function to return a database of the name specified.
# We want a database named 'project' in this case.
def get_db(db_name):
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    return db

## Function to return the collection we want to use in MongoDB
def get_collection(db, collection):
    collections_db = db[collection]
    return collections_db
In [50]:
## Function to insert json data into MongoDB
def insert_data(json_data, db_collection):
    with open(json_data, 'r') as f:
        ## json.loads() takes a string, while json.load() takes a file-like object.
        ## http://stackoverflow.com/questions/11568246/
        ## /loading-several-text-files-into-mongodb-using-pymongo
        for each_line in f.readlines():
            db_collection.insert(json.loads(each_line))
    print("Complete.")
In [3]:
def map_aggregate(db, collection, pipeline):
    db_collection = db[collection]
    result = db_collection.aggregate(pipeline)
    return result

Which users have contributed the most edits?

How do postcodes look?

We can initially see that some cleanup is needed for the zip codes. This includes:

1. Removing the four-digit postcode suffix.
2. Removing state letters from the postcode.
3. Converting the value to an integer.

After adding the postcode cleaning function, all zip codes adhere to a five-digit format, but a few (such as 89197 and 87100) that appear only once may have been entered incorrectly.
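
A helper along these lines can list postcodes by how often they appear, so the single-entry values stand out. This is a sketch (the function name is ours); it assumes a collection of the shaped documents, such as the combined 'cities' collection built below.

In []:
# Sketch: group documents by postcode and sort by ascending count,
# so postcodes that appear only once show up first.
def rare_postcodes(collection, limit=10):
    pipeline = [{"$match": {"address.postcode": {"$exists": 1}}},
                {"$group": {"_id": "$address.postcode",
                            "count": {"$sum": 1}}},
                {"$sort": {"count": 1}},
                {"$limit": limit}]
    return collection.aggregate(pipeline)

# Example (after the 'cities' collection is loaded below):
# pprint.pprint(rare_postcodes(db.cities))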


Combining cities into one collection and restructuring queries

We can add the city name to each node for aggregation in a combined database.

In [117]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def is_address(elem):
    return elem.attrib['k'].startswith("addr:")

def shape_element(element, city_name):
    '''
    Function to pull information for way/node entries from .osm files and 
    store as json records for loading into MongoDB.
    
    Returns data record as a dictionary.
    
    Example structure:
    
    {city_name: value,
     type: value,
     id: value,
     visible: value,
     address: {street: value,
               postcode: value,
               housenumber: value,
               housename: value},
     created: {version: value,
               changeset: value,
               timestamp: value,
               uid: value,
               user: value},
     pos: [lat, lon],
     node_refs: value}
    
    '''
    # Make an empty dictionary for the output node/element.
    node = {}
    # Process 'node' or 'way' elements only.
    if element.tag == "node" or element.tag == "way":
        ## Add in the city name to each node before writing
        ## out the json file.
        node["city_name"] = city_name
        # Create a sub-dictionary for the address info.
        address_info = {}
        nd_info = []
        # Add 'node'/'way' as 'type' 
        node["type"] = element.tag
        # Add 'id' as 'id'
        node["id"] = element.attrib["id"]
        # If visible exists, add it to dict
        if "visible" in element.attrib.keys():
            node["visible"] = element.attrib["visible"]
        # Add 'lat'/'lon' as 'pos' if they are available
        if "lat" in element.attrib.keys():
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        # Add version, changeset, timestamp, uid, and user under the root node 'created'
        node["created"] = {"version": element.attrib['version'],
                            "changeset": element.attrib['changeset'],
                            "timestamp": element.attrib['timestamp'],
                            "uid": element.attrib['uid'],
                            "user": element.attrib['user']}
        # Iterate through the tags of k,v pairs.
        for tag in element.iter("tag"):
            #print tag.attrib
            p = problemchars.search(tag.attrib['k'])
            if p:
                # print "PROBLEM:", p.group()
                # Do nothing currently
                continue
            elif is_address(tag):
                if ":" in tag.attrib['k'][5:]:
                    # print "Bad Address:", tag.attrib['k'], "--", tag.attrib['v']
                    # There is a second ':' after the 'addr:' prefix
                    # (e.g. 'addr:street:name'); this script only handles
                    # single-level address keys, so skip it.
                    continue
                else:
                    # Otherwise add the part after the 'addr:' prefix as the key
                    # and the value from 'v' as the value in our address_info dict,
                    # e.g. 'addr:state' adds the key 'state'.
                    #
                    # Check the postcodes and correct them if applicable before adding into address_info
                    if tag.attrib['k'][5:] == 'postcode':
                        tag.attrib['v'] = correct_postal_codes(tag.attrib['v'])
                    address_info[tag.attrib['k'][5:]] = tag.attrib['v']
                    #print "Good Address:", tag.attrib['k'], "--", tag.attrib['v']
            else:
                # If there's no ':', just add the 'k' as a key, and 'v' as a value in our node dict.
                node[tag.attrib['k']] = tag.attrib['v']
                #print "Outside:", tag.attrib['k'], "--", tag.attrib['v']
        # If we found 'addr:' info and added it to our address_info dict,
        if address_info != {}:
            # Then add that address_info dict under the node 'address'
            node['address'] = address_info
        # Iterate through the 'nd' nodes if they exist.
        for tag2 in element.iter("nd"):
            # add each entry in a running list.
            nd_info.append(tag2.attrib['ref'])
        # If the resulting list isn't empty,
        if nd_info != []:
            # Add the list under the node 'node_refs'
            node['node_refs'] = nd_info
        return node
    else:
        # If the element isn't 'node' or 'way', just return None.
        return None


def process_map(file_in, city_name, pretty = False):
    '''
    Shape each node/way element from the .osm file (tagging it with the
    given city name) and write the resulting dictionaries out to a .json
    file for loading into MongoDB.
    
    Returns the list of shaped dictionaries.
    '''
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element, city_name)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [122]:
data = process_map("data/honolulu_hawaii.osm", 'honolulu', False)
In [123]:
data = process_map("data/albuquerque_new-mexico.osm", 'albuquerque', False)
In [4]:
# Get 'project' database
db = get_db('project')

# Get 'cities' collection in the 'project' database
# Put honolulu and albuquerque city data in this collection.
db_cities = get_collection(db, 'cities')
In [125]:
hnl_json_data = 'data/honolulu_hawaii.osm.json'  
insert_data(hnl_json_data, db_cities)
Complete.

In [126]:
abq_json_data = 'data/albuquerque_new-mexico.osm.json' 
insert_data(abq_json_data, db_cities)
Complete.

In [5]:
db.cities
Out[5]:
Collection(Database(MongoClient('localhost', 27017), u'project'), u'cities')

We can either look up the top user contributors overall grouped by city...

In [8]:
result = db.cities.aggregate([{"$match":{"created.user":{"$exists":1}}},
                {"$group": 
                 {"_id": {"City":"$city_name",
                          "User":"$created.user"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "User":"$_id.User",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 7 }])
pprint.pprint(result)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 98401, u'User': u'Tom_Holland'},
             {u'City': u'albuquerque',
              u'Count': 88041,
              u'User': u'EdHillsman'},
             {u'City': u'albuquerque', u'Count': 37604, u'User': u'anjbe'},
             {u'City': u'albuquerque', u'Count': 32985, u'User': u'jackbus'},
             {u'City': u'albuquerque',
              u'Count': 31170,
              u'User': u'woodpeck_fixbot'},
             {u'City': u'honolulu', u'Count': 13051, u'User': u'ikiya'},
             {u'City': u'honolulu',
              u'Count': 11205,
              u'User': u'woodpeck_fixbot'}]}

Or return the top user contributors for each city individually.

That is what we will do below, by making a pipeline function that switches between cities.

In [26]:
def make_city_pipeline(city):
    pipeline = [{"$match":{"created.user":{"$exists":1},
                                          "city_name":city}},
                 {"$group": {"_id": {"City":"$city_name",
                                     "User":"$created.user"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "User":"$_id.User",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 5 }]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 98401, u'User': u'Tom_Holland'},
             {u'City': u'honolulu', u'Count': 13051, u'User': u'ikiya'},
             {u'City': u'honolulu',
              u'Count': 11205,
              u'User': u'woodpeck_fixbot'},
             {u'City': u'honolulu', u'Count': 10651, u'User': u'pdunn'},
             {u'City': u'honolulu',
              u'Count': 10251,
              u'User': u'Chris Lawrence'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque',
              u'Count': 88041,
              u'User': u'EdHillsman'},
             {u'City': u'albuquerque', u'Count': 37604, u'User': u'anjbe'},
             {u'City': u'albuquerque', u'Count': 32985, u'User': u'jackbus'},
             {u'City': u'albuquerque',
              u'Count': 31170,
              u'User': u'woodpeck_fixbot'},
             {u'City': u'albuquerque',
              u'Count': 10406,
              u'User': u'greenv505'}]}


In [24]:
## The Honolulu area and Albuquerque are represented in this dataset.
## Some other areas across Oahu are also sparsely represented.
def make_city_pipeline(city):
    pipeline = [{"$match":{"address.city":{"$exists":1},
                           "city_name":city}},
                       {"$group":{"_id":{"City":"$city_name",
                                         "City_name":"$address.city"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "Name":"$_id.City_name",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":10}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'Count': 429, u'Name': u'Honolulu'},
             {u'Count': 6, u'Name': u'Kailua'},
             {u'Count': 5, u'Name': u'honolulu'},
             {u'Count': 3, u'Name': u'Honolulu, HI'},
             {u'Count': 3, u'Name': u'Honlulu'},
             {u'Count': 3, u'Name': u'Haleiwa'},
             {u'Count': 1, u'Name': u'Hauula'},
             {u'Count': 1, u'Name': u'Aiea'},
             {u'Count': 1, u'Name': u'honolulu, HI'},
             {u'Count': 1, u'Name': u'Mililani'}]}

{u'ok': 1.0,
 u'result': [{u'Count': 1445, u'Name': u'Albuquerque'},
             {u'Count': 2, u'Name': u'albuquerque'},
             {u'Count': 2, u'Name': u'Albuquerque, NM'},
             {u'Count': 1, u'Name': u'Kirtland Air Force Base'}]}


In [13]:
# Parking is the highest reported amenity by far in both cities.
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                          "city_name":city}},
                 {"$group": {"_id": {"City":"$city_name",
                                    "Amenity":"$amenity"},
                            "count": {"$sum": 1}}},
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "Amenity":"$_id.Amenity",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 5 }]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking', u'City': u'honolulu', u'Count': 280},
             {u'Amenity': u'restaurant', u'City': u'honolulu', u'Count': 123},
             {u'Amenity': u'school', u'City': u'honolulu', u'Count': 79},
             {u'Amenity': u'fast_food', u'City': u'honolulu', u'Count': 43},
             {u'Amenity': u'cafe', u'City': u'honolulu', u'Count': 37}]}

{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking',
              u'City': u'albuquerque',
              u'Count': 1270},
             {u'Amenity': u'school', u'City': u'albuquerque', u'Count': 258},
             {u'Amenity': u'place_of_worship',
              u'City': u'albuquerque',
              u'Count': 227},
             {u'Amenity': u'restaurant',
              u'City': u'albuquerque',
              u'Count': 221},
             {u'Amenity': u'bicycle_parking',
              u'City': u'albuquerque',
              u'Count': 221}]}


In [7]:
def make_city_pipeline(city):
    pipeline = [{"$match":{"address.postcode":{"$exists":1},
                                          "city_name":city}},   
                 {"$group": {"_id": {"City":"$city_name",
                                     "Zip":"$address.postcode"},
                            "count": {"$sum": 1}}},
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "Zipcode":"$_id.Zip",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 1 }]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 219, u'Zipcode': 96815}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 476, u'Zipcode': 87123}]}


In [13]:
# Total records in the database.
print "Both Cities:", db.cities.find().count() 
print "Honolulu:", db.cities.find({'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({'city_name':'albuquerque'}).count()
Both Cities: 489274
Honolulu: 227752
Albuquerque: 261522

In [15]:
# Number of 'node' documents.
print "Both Cities:", db.cities.find({"type":"node"}).count()
print "Honolulu:", db.cities.find({"type":"node",
                                    'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({"type":"node",
                                    'city_name':'albuquerque'}).count()
Both Cities: 432587
Honolulu: 206262
Albuquerque: 226325

In [16]:
# Number of 'way' documents.
print "Both Cities:", db.cities.find({'type':'way'}).count()
print "Honolulu:", db.cities.find({'type':'way',
                                  'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({'type':'way',
                                  'city_name':'albuquerque'}).count()
Both Cities: 56648
Honolulu: 21459
Albuquerque: 35189

In [17]:
# Number of contributors.
print "Contributors:", len(db.cities.distinct("created.user"))
Contributors: 611


In [85]:
## Find the most popular amenities
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                          "city_name":city}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Amenity":"$amenity"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Amenity":"$_id.Amenity",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":10}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking', u'City': u'honolulu', u'Count': 280},
             {u'Amenity': u'restaurant', u'City': u'honolulu', u'Count': 123},
             {u'Amenity': u'school', u'City': u'honolulu', u'Count': 79},
             {u'Amenity': u'fast_food', u'City': u'honolulu', u'Count': 43},
             {u'Amenity': u'cafe', u'City': u'honolulu', u'Count': 37},
             {u'Amenity': u'fire_station',
              u'City': u'honolulu',
              u'Count': 36},
             {u'Amenity': u'library', u'City': u'honolulu', u'Count': 31},
             {u'Amenity': u'waste_basket',
              u'City': u'honolulu',
              u'Count': 28},
             {u'Amenity': u'toilets', u'City': u'honolulu', u'Count': 26},
             {u'Amenity': u'place_of_worship',
              u'City': u'honolulu',
              u'Count': 23}]}

{u'ok': 1.0,
 u'result': [{u'Amenity': u'parking',
              u'City': u'albuquerque',
              u'Count': 1270},
             {u'Amenity': u'school', u'City': u'albuquerque', u'Count': 258},
             {u'Amenity': u'place_of_worship',
              u'City': u'albuquerque',
              u'Count': 227},
             {u'Amenity': u'restaurant',
              u'City': u'albuquerque',
              u'Count': 221},
             {u'Amenity': u'bicycle_parking',
              u'City': u'albuquerque',
              u'Count': 221},
             {u'Amenity': u'fast_food',
              u'City': u'albuquerque',
              u'Count': 171},
             {u'Amenity': u'fuel', u'City': u'albuquerque', u'Count': 113},
             {u'Amenity': u'cafe', u'City': u'albuquerque', u'Count': 56},
             {u'Amenity': u'post_box', u'City': u'albuquerque', u'Count': 53},
             {u'Amenity': u'bank', u'City': u'albuquerque', u'Count': 46}]}


In [86]:
## Find the most popular places of worship
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                  "amenity":"place_of_worship",
                                  "city_name":city}},
                       {"$group":{"_id": {"City":"$city_name",
                                          "Religion":"$religion"},
                                  "count":{"$sum":1}}},
                       {"$project":{"_id":0,
                                    "City":"$_id.City",
                                    "Religion":"$_id.Religion",
                                    "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 14, u'Religion': u'christian'},
             {u'City': u'honolulu', u'Count': 5, u'Religion': u'buddhist'},
             {u'City': u'honolulu', u'Count': 3},
             {u'City': u'honolulu', u'Count': 1, u'Religion': u'muslim'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque',
              u'Count': 186,
              u'Religion': u'christian'},
             {u'City': u'albuquerque', u'Count': 36},
             {u'City': u'albuquerque', u'Count': 3, u'Religion': u'jewish'},
             {u'City': u'albuquerque', u'Count': 1, u'Religion': u'buddhist'},
             {u'City': u'albuquerque', u'Count': 1, u'Religion': u'sikh'}]}


In [30]:
## Find the most popular restaurants
## Restaurants with no cuisine value are left in to show how many 
## were reported without a cuisine filled in.
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                 "amenity":"restaurant",
                                 "city_name":city}},
                      {"$group":{"_id":{"City":"$city_name",
                                        "Food":"$cuisine"},
                                 "count":{"$sum":1}}},
                      {"$project":{"_id":0,
                                  "City":"$_id.City",
                                  "Food":"$_id.Food",
                                  "Count":"$count"}},
                      {"$sort":{"Count":-1}}, 
                      {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 87},
             {u'City': u'honolulu', u'Count': 7, u'Food': u'pizza'},
             {u'City': u'honolulu', u'Count': 3, u'Food': u'regional'},
             {u'City': u'honolulu', u'Count': 3, u'Food': u'japanese'},
             {u'City': u'honolulu', u'Count': 3, u'Food': u'chinese'},
             {u'City': u'honolulu', u'Count': 2, u'Food': u'burger'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 134},
             {u'City': u'albuquerque', u'Count': 16, u'Food': u'mexican'},
             {u'City': u'albuquerque', u'Count': 10, u'Food': u'pizza'},
             {u'City': u'albuquerque', u'Count': 6, u'Food': u'american'},
             {u'City': u'albuquerque', u'Count': 5, u'Food': u'regional'},
             {u'City': u'albuquerque', u'Count': 4, u'Food': u'burger'}]}


In [88]:
## Find the most popular fast food joints
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                 "amenity":"fast_food",
                                 "city_name":city}},
                      {"$group":{"_id":{"City":"$city_name",
                                        "Food":"$cuisine"},
                                 "count":{"$sum":1}}},
                      {"$project":{"_id":0,
                                  "City":"$_id.City",
                                  "Food":"$_id.Food",
                                  "Count":"$count"}},
                      {"$sort":{"Count":-1}}, 
                      {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 19},
             {u'City': u'honolulu', u'Count': 13, u'Food': u'burger'},
             {u'City': u'honolulu', u'Count': 2, u'Food': u'sandwich'},
             {u'City': u'honolulu', u'Count': 2, u'Food': u'sushi'},
             {u'City': u'honolulu', u'Count': 1, u'Food': u'thai'},
             {u'City': u'honolulu', u'Count': 1, u'Food': u'pizza'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 98},
             {u'City': u'albuquerque', u'Count': 31, u'Food': u'burger'},
             {u'City': u'albuquerque', u'Count': 16, u'Food': u'sandwich'},
             {u'City': u'albuquerque', u'Count': 6, u'Food': u'pizza'},
             {u'City': u'albuquerque', u'Count': 4, u'Food': u'mexican'},
             {u'City': u'albuquerque', u'Count': 3, u'Food': u'chicken'}]}


In [89]:
## Find the names of the most popular fast food joints
def make_city_pipeline(city):
    pipeline = [{"$match":{"amenity":{"$exists":1},
                                 "amenity":"fast_food",
                                 "city_name":city}},
                      {"$group":{"_id":{"City":"$city_name",
                                        "Name":"$name"},
                                 "count":{"$sum":1}}},
                      {"$project":{"_id":0,
                                  "City":"$_id.City",
                                  "Name":"$_id.Name",
                                  "Count":"$count"}},
                      {"$sort":{"Count":-1}}, 
                      {"$limit":6}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 8, u'Name': u"McDonald's"},
             {u'City': u'honolulu', u'Count': 4, u'Name': u'Subway'},
             {u'City': u'honolulu', u'Count': 3, u'Name': u'Burger King'},
             {u'City': u'honolulu', u'Count': 2, u'Name': u'KFC'},
             {u'City': u'honolulu',
              u'Count': 2,
              u'Name': u'Loco Moco Drive-Inn'},
             {u'City': u'honolulu', u'Count': 2, u'Name': u'Taco Bell'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 23, u'Name': u'Subway'},
             {u'City': u'albuquerque',
              u'Count': 12,
              u'Name': u"Blake's Lotaburger"},
             {u'City': u'albuquerque', u'Count': 11, u'Name': u"McDonald's"},
             {u'City': u'albuquerque', u'Count': 9, u'Name': u"Wendy's"},
             {u'City': u'albuquerque', u'Count': 8, u'Name': u'Sonic'},
             {u'City': u'albuquerque', u'Count': 7, u'Name': u'Burger King'}]}


In [90]:
## Find the most popular shops.
def make_city_pipeline(city):
    pipeline = [{"$match":{"shop":{"$exists":1},
                                          "city_name":city}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Shop":"$shop"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Shop":"$_id.Shop",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":10}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 50, u'Shop': u'supermarket'},
             {u'City': u'honolulu', u'Count': 24, u'Shop': u'convenience'},
             {u'City': u'honolulu', u'Count': 18, u'Shop': u'clothes'},
             {u'City': u'honolulu', u'Count': 7, u'Shop': u'mall'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'hairdresser'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'bakery'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'hifi'},
             {u'City': u'honolulu', u'Count': 4, u'Shop': u'jewelry'},
             {u'City': u'honolulu', u'Count': 3, u'Shop': u'sports'},
             {u'City': u'honolulu', u'Count': 3, u'Shop': u'garden_centre'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 66, u'Shop': u'convenience'},
             {u'City': u'albuquerque', u'Count': 46, u'Shop': u'supermarket'},
             {u'City': u'albuquerque', u'Count': 22, u'Shop': u'car_repair'},
             {u'City': u'albuquerque', u'Count': 18, u'Shop': u'clothes'},
             {u'City': u'albuquerque', u'Count': 15, u'Shop': u'hairdresser'},
             {u'City': u'albuquerque', u'Count': 14, u'Shop': u'car'},
             {u'City': u'albuquerque',
              u'Count': 12,
              u'Shop': u'department_store'},
             {u'City': u'albuquerque', u'Count': 12, u'Shop': u'furniture'},
             {u'City': u'albuquerque', u'Count': 12, u'Shop': u'bicycle'},
             {u'City': u'albuquerque', u'Count': 12, u'Shop': u'mall'}]}


In [93]:
## Find the names of the most popular supermarkets
def make_city_pipeline(city):
    pipeline = [{"$match":{"shop":{"$exists":1},
                           "city_name":city,
                           "shop":"supermarket"}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Supermarket":"$name"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Supermarket":"$_id.Supermarket",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":5}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 4, u'Supermarket': u'Foodland'},
             {u'City': u'honolulu', u'Count': 2, u'Supermarket': u'Ross'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Supermarket': u'Don Quijote'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Supermarket': u'Safeway 1263'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Supermarket': u'Malama Market'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque',
              u'Count': 6,
              u'Supermarket': u"Smith's"},
             {u'City': u'albuquerque', u'Count': 5},
             {u'City': u'albuquerque',
              u'Count': 4,
              u'Supermarket': u"Albertson's"},
             {u'City': u'albuquerque',
              u'Count': 2,
              u'Supermarket': u"Trader Joe's"},
             {u'City': u'albuquerque',
              u'Count': 2,
              u'Supermarket': u'Walmart'}]}


In [94]:
## Find the names of the most popular convenience stores
def make_city_pipeline(city):
    pipeline = [{"$match":{"shop":{"$exists":1},
                           "city_name":city,
                           "shop":"convenience"}},
                       {"$group":{"_id":{"City":"$city_name",
                                  "Name":"$name"},
                           "count":{"$sum":1}}},
                       {"$project": {'_id':0,
                                     "City":"$_id.City",
                                     "Name":"$_id.Name",
                                     "Count":"$count"}},
                       {"$sort":{"Count":-1}},
                       {"$limit":5}]
    return pipeline

pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
{u'ok': 1.0,
 u'result': [{u'City': u'honolulu', u'Count': 7, u'Name': u'ABC Store'},
             {u'City': u'honolulu', u'Count': 2, u'Name': u'AT&T Store'},
             {u'City': u'honolulu', u'Count': 1, u'Name': u'7-Eleven'},
             {u'City': u'honolulu',
              u'Count': 1,
              u'Name': u'AT&T Store Kapahulu'},
             {u'City': u'honolulu', u'Count': 1, u'Name': u'7-ELEVEN    76'}]}

{u'ok': 1.0,
 u'result': [{u'City': u'albuquerque', u'Count': 14, u'Name': u'Circle K'},
             {u'City': u'albuquerque', u'Count': 14},
             {u'City': u'albuquerque', u'Count': 12, u'Name': u'7-11'},
             {u'City': u'albuquerque', u'Count': 5, u'Name': u'Giant'},
             {u'City': u'albuquerque', u'Count': 2, u'Name': u"Bubba's"}]}