__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


from pprint import PrettyPrinter
from pymongo import MongoClient
import pandas as pd


# Pretty-printer with a two-space indent, used to display nested query results
pp = PrettyPrinter(indent=2)


# Open a connection to the database server running at localhost on port 27017
MONGO_HOST, MONGO_PORT = "localhost", 27017
client = MongoClient(host=MONGO_HOST, port=MONGO_PORT)


# Ask the client for its databases; the call evaluates to a cursor object,
# so it has to be iterated (or passed to list()) before the entries are visible
client.list_databases()

<pymongo.command_cursor.CommandCursor at 0x7fec6d134ee0>


# Task: print every integer from 0 through 4, inclusive
from sys import getsizeof

# A list literal holds all five values up front
my_list = [0, 1, 2, 3, 4]

# Walk the list and print each integer
for number in my_list:
    print(number)


# Same task, driven by a range instead of a list
from sys import getsizeof

my_list = [0, 1, 2, 3, 4]
# A range describes the values 0..4 and produces them on demand while looping
my_range = range(5)

for number in my_range:
    print(number)


# Look at the size of my_list
getsizeof(my_list)

120


# Look at the size of my_range (sys.getsizeof reports the shallow size of the object itself)
getsizeof(my_range)

48


from sys import getsizeof

# Grow both containers to nine values so their sizes can be compared again
my_range = range(9)
my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8]

for number in my_list:
    print(number)


# Size of the nine-element list
getsizeof(my_list)

152


# Size of the nine-value range, for comparison with the list measured just before
getsizeof(my_range)

48


from sys import getsizeof

my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8]
# Stretch the range to eight million values
my_range = range(8_000_000)


# Size of the eight-million-value range
getsizeof(my_range)

48


# Consume the cursor returned by list_databases and collect every entry into a list
list(client.list_databases())

[{'name': 'admin', 'sizeOnDisk': 40960, 'empty': False},
 {'name': 'air-quality', 'sizeOnDisk': 6938624, 'empty': False},
 {'name': 'config', 'sizeOnDisk': 94208, 'empty': False},
 {'name': 'local', 'sizeOnDisk': 73728, 'empty': False}]


# Same database listing, formatted with the PrettyPrinter for readability
pp.pprint(list(client.list_databases()))

[ {'empty': False, 'name': 'admin', 'sizeOnDisk': 40960},
  {'empty': False, 'name': 'air-quality', 'sizeOnDisk': 6938624},
  {'empty': False, 'name': 'config', 'sizeOnDisk': 94208},
  {'empty': False, 'name': 'local', 'sizeOnDisk': 73728}]


# Bind the "air-quality" database to the variable db
db = client.get_database("air-quality")


# Request the collections available in db; the call evaluates to a cursor,
# not a printed list, so it must be iterated to see the entries
db.list_collections()

<pymongo.command_cursor.CommandCursor at 0x7fec65899d60>


# Turn the cursor into a list and look at the first item
list(db.list_collections())[0]

{'name': 'lagos',
 'type': 'timeseries',
 'options': {'timeseries': {'timeField': 'timestamp',
   'metaField': 'metadata',
   'granularity': 'seconds',
   'bucketMaxSpanSeconds': 3600}},
 'info': {'readOnly': False}}


# Only the names are of interest, so loop over the collection entries
# and print the value stored under each entry's "name" key
for collection_info in db.list_collections():
    print(collection_info["name"])

lagos
system.buckets.lagos
dar-es-salaam
system.buckets.dar-es-salaam
nairobi
system.buckets.nairobi
system.views


# Assign the "nairobi" collection in db to the variable name nairobi.
# The collection must be looked up through db; a bare ["nairobi"] would only
# create a plain Python list holding the string.
nairobi = db["nairobi"]


# Count the documents in the collection using an empty filter
nairobi.count_documents({})

202212


# Retrieve a single document from the nairobi collection (empty filter),
# keep it in result, and pretty-print it
match_everything = {}
result = nairobi.find_one(match_everything)
pp.pprint(result)

{ 'P1': 39.67,
  '_id': ObjectId('62a1510e5e1eb913597171ee'),
  'metadata': { 'lat': -1.3,
                'lon': 36.785,
                'measurement': 'P1',
                'sensor_id': 57,
                'sensor_type': 'SDS011',
                'site': 29},
  'timestamp': datetime.datetime(2018, 9, 1, 0, 0, 2, 472000)}


# NOTE(review): subscripting the collection with a string evaluates to another
# Collection object rather than to the field's values, so this reports the
# Collection type. If the goal was the type of the sensor_id value, inspect the
# retrieved document instead, e.g. result["metadata"]["sensor_id"] — confirm intent.
type(nairobi['sensor_id'])

pymongo.collection.Collection


# Use the distinct method to list the unique values of "metadata.site";
# the length of that list is the number of sensor sites in the nairobi collection
nairobi.distinct("metadata.site")

[29, 6]


# Use the count_documents method with a filter on "metadata.site" to count the readings for one site (here site 6)
nairobi.count_documents({"metadata.site": 6})

70360


# Report the number of readings for each of the two sites
site_6_total = nairobi.count_documents({"metadata.site": 6})
print("Documents from site 6:", site_6_total)
site_29_total = nairobi.count_documents({"metadata.site": 29})
print("Documents from site 29:", site_29_total)

Documents from site 6: 70360
Documents from site 29: 131852


# First step towards per-site reading counts with aggregate:
# group the documents by site (this stage alone only yields the group ids)
group_by_site = {"$group": {"_id": "$metadata.site"}}
result = nairobi.aggregate([group_by_site])
pp.pprint(list(result))

[{'_id': 29}, {'_id': 6}]


# Group by site again, this time adding a "count" of the documents in each group
pipeline = [
    {"$group": {"_id": "$metadata.site", "count": {"$count": {}}}},
]
result = nairobi.aggregate(pipeline)
pp.pprint(list(result))

[{'_id': 6, 'count': 70360}, {'_id': 29, 'count': 131852}]


# Use the distinct method to list the unique values of "metadata.measurement",
# i.e. the types of measurements taken in the nairobi collection
nairobi.distinct("metadata.measurement")

['P1', 'temperature', 'P2', 'humidity']


# Use the find method to retrieve the PM 2.5 ("P2") readings from all sites,
# capping the results at 3 records
p2_readings = nairobi.find({"metadata.measurement": "P2"})
result = p2_readings.limit(3)
pp.pprint(list(result))

[ { 'P2': 34.43,
    '_id': ObjectId('62a1510f5e1eb9135971f279'),
    'metadata': { 'lat': -1.3,
                  'lon': 36.785,
                  'measurement': 'P2',
                  'sensor_id': 57,
                  'sensor_type': 'SDS011',
                  'site': 29},
    'timestamp': datetime.datetime(2018, 9, 1, 0, 0, 2, 472000)},
  { 'P2': 30.53,
    '_id': ObjectId('62a1510f5e1eb9135971f27a'),
    'metadata': { 'lat': -1.3,
                  'lon': 36.785,
                  'measurement': 'P2',
                  'sensor_id': 57,
                  'sensor_type': 'SDS011',
                  'site': 29},
    'timestamp': datetime.datetime(2018, 9, 1, 0, 5, 3, 941000)},
  { 'P2': 22.8,
    '_id': ObjectId('62a1510f5e1eb9135971f27b'),
    'metadata': { 'lat': -1.3,
                  'lon': 36.785,
                  'measurement': 'P2',
                  'sensor_id': 57,
                  'sensor_type': 'SDS011',
                  'site': 29},
    'timestamp': datetime.datetime(2018, 9, 1, 0, 10, 4, 374000)}]


# Use the aggregate method to count the readings of each type ("humidity",
# "temperature", "P2", and "P1"). There is no $match stage here, so the
# counts are taken over the whole collection rather than a single site.
group_by_measurement = {
    "$group": {"_id": "$metadata.measurement", "count": {"$count": {}}}
}
result = nairobi.aggregate([group_by_measurement])
pp.pprint(list(result))

[ {'_id': 'P1', 'count': 51076},
  {'_id': 'P2', 'count': 51076},
  {'_id': 'temperature', 'count': 50030},
  {'_id': 'humidity', 'count': 50030}]


# Count the readings of each type in site 6: filter to that site first, then group by measurement
only_site_6 = {"$match": {"metadata.site": 6}}
count_per_measurement = {
    "$group": {"_id": "$metadata.measurement", "count": {"$count": {}}}
}
result = nairobi.aggregate([only_site_6, count_per_measurement])
pp.pprint(list(result))

[ {'_id': 'P1', 'count': 18169},
  {'_id': 'temperature', 'count': 17011},
  {'_id': 'P2', 'count': 18169},
  {'_id': 'humidity', 'count': 17011}]


# Use the aggregate method to count the readings of each type ("humidity",
# "temperature", "P2", and "P1") in site 29: match the site, then group by measurement
pipeline = [
    {"$match": {"metadata.site": 29}},
    {"$group": {"_id": "$metadata.measurement", "count": {"$count": {}}}},
]
result = nairobi.aggregate(pipeline)
pp.pprint(list(result))

[ {'_id': 'P1', 'count': 32907},
  {'_id': 'temperature', 'count': 33019},
  {'_id': 'P2', 'count': 32907},
  {'_id': 'humidity', 'count': 33019}]


# Use the find method to retrieve the PM 2.5 readings from site 29.
# The first argument is a dictionary holding the search criteria.
site_29_p2 = {"metadata.site": 29, "metadata.measurement": "P2"}
result = nairobi.find(site_29_p2)
# Pull just the first matching document off the cursor and display it
pp.pprint(next(result))

{ 'P2': 34.43,
  '_id': ObjectId('62a1510f5e1eb9135971f279'),
  'metadata': { 'lat': -1.3,
                'lon': 36.785,
                'measurement': 'P2',
                'sensor_id': 57,
                'sensor_type': 'SDS011',
                'site': 29},
  'timestamp': datetime.datetime(2018, 9, 1, 0, 0, 2, 472000)}


# Same search, with the projection argument limiting each result to the
# "P2" and "timestamp" keys only ("_id" is switched off explicitly)
site_29_p2 = {"metadata.site": 29, "metadata.measurement": "P2"}
wanted_keys = {"P2": 1, "timestamp": 1, "_id": 0}
result = nairobi.find(site_29_p2, projection=wanted_keys)
pp.pprint(next(result))

{'P2': 34.43, 'timestamp': datetime.datetime(2018, 9, 1, 0, 0, 2, 472000)}


# Query the site-29 PM 2.5 readings, keeping only the "P2" and "timestamp" keys
site_29_p2_filter = {"metadata.site": 29, "metadata.measurement": "P2"}
reading_fields = {"P2": 1, "timestamp": 1, "_id": 0}
result = nairobi.find(site_29_p2_filter, projection=reading_fields)


# Read records from result into the DataFrame df
df = pd.DataFrame(result)
df.head()


# Issue the same query again to get a fresh cursor for the second DataFrame
result = nairobi.find(site_29_p2_filter, projection=reading_fields)


# Build the DataFrame again, this time setting the index to "timestamp"
frame = pd.DataFrame(result)
df = frame.set_index("timestamp")
df.head()

	timestamp	P2
0	2018-09-01 00:00:02.472	34.43
1	2018-09-01 00:05:03.941	30.53
2	2018-09-01 00:10:04.374	22.80
3	2018-09-01 00:15:04.245	13.30
4	2018-09-01 00:20:04.869	16.57

	P2
timestamp
2018-09-01 00:00:02.472	34.43
2018-09-01 00:05:03.941	30.53
2018-09-01 00:10:04.374	22.80
2018-09-01 00:15:04.245	13.30
2018-09-01 00:20:04.869	16.57

Prepare Data¶

Connect¶

Explore¶

Import¶