-
Notifications
You must be signed in to change notification settings - Fork 0
/
LAB5_assignment1.py
101 lines (72 loc) · 2.48 KB
/
LAB5_assignment1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#
# Imports needed by the module-level code in this script: pyplot is used for
# the style setup right below and for the plt.show() call at the end of the file.
#
import matplotlib.pyplot as plt

# Look Pretty: apply the 'ggplot' style sheet to every figure created later.
# plt.style is the matplotlib.style module, so a single call is sufficient.
plt.style.use('ggplot')
#
# TODO: To procure the dataset, follow these steps:
# 1. Navigate to: https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2
# 2. In the 'Primary Type' column, click on the 'Menu' button next to the info button,
# and select 'Filter This Column'. It might take a second for the filter option to
# show up, since it has to load the entire list first.
# 3. Scroll down to 'GAMBLING'
# 4. Click the light blue 'Export' button next to the 'Filter' button, and select 'Download As CSV'
def doKMeans(df, n_clusters=7):
    """Scatter-plot the samples in `df` and overlay K-Means centroids.

    A new figure is created, every row is drawn at (Longitude, Latitude),
    a K-Means model is fitted on those two columns only, and the resulting
    cluster centers are drawn as red crosses and printed to stdout.

    Args:
        df: DataFrame exposing `Longitude` and `Latitude` columns. The
            script drops rows containing NaNs before calling this function;
            no missing-value handling is done here.
        n_clusters: Number of cluster centers to look for. Defaults to 7,
            the value this lab asks for.

    Returns:
        None. The figure is left open; the caller decides when to show it.
    """
    # Imported locally so the function does not depend on which names the
    # module happened to import at the top of the file.
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans

    # Plot the raw data with a '.' marker at 0.3 alpha.
    # Longitude = x, Latitude = y.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(df.Longitude, df.Latitude, marker='.', alpha=0.3)

    # Only the coordinates are relevant for clustering; the remaining
    # columns of the dataset are ignored.
    df_lim = df[['Longitude', 'Latitude']]

    # The fitted estimator is kept under the name `model`, as the lab
    # instructions require.
    model = KMeans(n_clusters=n_clusters).fit(df_lim)

    # Print and plot the centroids. Column 0 is Longitude and column 1 is
    # Latitude, matching the column order of df_lim.
    centroids = model.cluster_centers_
    ax.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red',
               alpha=0.5, linewidths=3, s=169)
    # print() with a single argument behaves the same on Python 2 and 3.
    print(centroids)
#
# Load the dataset after importing Pandas. pyplot is imported here as well
# because plt.show() at the end of this section runs at module scope.
#
import matplotlib.pyplot as plt
import pandas as pd

# read_csv already returns a DataFrame, so no further wrapping is needed.
df = pd.read_csv('E:/Python/Crimes.csv', sep=',', header=0)

#
# Drop any rows with NaNs in them
#
df = df.dropna(axis=0)

#
# Print out the dtypes of the dataset. A bare `df.dtypes` expression only
# echoes in an interactive session, so print it explicitly.
#
print(df.dtypes)

#
# Coerce the 'Date' feature (which is currently a string object) into real
# dates, and confirm by re-printing the dtypes. NOTE: This is a slow process.
# errors='coerce' turns unparsable values into NaT instead of raising.
#
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
print(df.dtypes)

# INFO: Print & Plot the full dataset
doKMeans(df)

#
# Keep only the samples with a Date > '2011-01-01', then, in a new figure,
# plot those crime incidents together with a fresh K-Means run's centroids.
#
df_dt = df[df['Date'] > '2011-01-01']

# INFO: Print & Plot the filtered dataset
doKMeans(df_dt)

# Display both figures created by the two doKMeans calls.
plt.show()