Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

commit id ce48019 change 'rb' to 'r' #256

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ text_learning/your_email_authors.pkl
my_classifier.pkl
my_dataset.pkl
my_feature_list.pkl
.ipynb_checkpoints/enron_data_ml-checkpoint.ipynb
enron_mail_20150507.tar.gz
Binary file added choose_your_own/test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
52 changes: 52 additions & 0 deletions choose_your_own/your_algorithm KNN.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/python

import matplotlib.pyplot as plt
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture
import numpy as np

features_train, labels_train, features_test, labels_test = makeTerrainData()


### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [point[0] for point, label in zip(features_train, labels_train) if label == 0]
bumpy_fast = [point[1] for point, label in zip(features_train, labels_train) if label == 0]
grade_slow = [point[0] for point, label in zip(features_train, labels_train) if label == 1]
bumpy_slow = [point[1] for point, label in zip(features_train, labels_train) if label == 1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
# BUG FIX: both classes must use the same (x, y) = (bumpiness, grade) axis
# order. The original plotted "slow" as (grade, bumpy), mirroring those
# points across the diagonal relative to the "fast" class and the axis labels.
plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.neighbors import KNeighborsClassifier
### choose k = floor(sqrt(n_samples)), bumped up by one when even, so that
### k is always odd (an odd k avoids ties in KNN majority voting)
def kvalues(n_samples=None):
    """Return an odd neighbor count k for KNN.

    k is int(sqrt(n_samples)); if that value is even, k + 1 is returned
    instead so the result is always odd.

    n_samples -- sample count to base k on; defaults to len(features_train)
                 so the original zero-argument call keeps working.
    """
    if n_samples is None:
        n_samples = len(features_train)
    # Compute the square root once instead of up to three times.
    k = int(np.sqrt(n_samples))
    return k if k % 2 != 0 else k + 1

### Fit a KNN classifier using the odd-k heuristic above, then report its
### accuracy on the held-out test set.
clf = KNeighborsClassifier(n_neighbors=kvalues())
clf.fit(features_train, labels_train)
accuracy = clf.score(features_test, labels_test)
print(accuracy)


try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    # prettyPicture is optional; skip the plot when it isn't available.
    pass
43 changes: 43 additions & 0 deletions choose_your_own/your_algorithm Randomforest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/python

import matplotlib.pyplot as plt
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture
import numpy as np

features_train, labels_train, features_test, labels_test = makeTerrainData()


### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [point[0] for point, label in zip(features_train, labels_train) if label == 0]
bumpy_fast = [point[1] for point, label in zip(features_train, labels_train) if label == 0]
grade_slow = [point[0] for point, label in zip(features_train, labels_train) if label == 1]
bumpy_slow = [point[1] for point, label in zip(features_train, labels_train) if label == 1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
# BUG FIX: both classes must use the same (x, y) = (bumpiness, grade) axis
# order. The original plotted "slow" as (grade, bumpy), mirroring those
# points across the diagonal relative to the "fast" class and the axis labels.
plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import RandomForestClassifier

### Train a random forest (a larger min_samples_split yields a smoother
### decision boundary) and report accuracy on the held-out test set.
clf = RandomForestClassifier(min_samples_split=50)
clf.fit(features_train, labels_train)
accuracy = clf.score(features_test, labels_test)
print(accuracy)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    # prettyPicture is optional; skip the plot when it isn't available.
    pass
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import matplotlib.pyplot as plt
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture
import numpy as np

features_train, labels_train, features_test, labels_test = makeTerrainData()

Expand All @@ -28,15 +29,14 @@
################################################################################


### your code here! name your classifier object clf if you want the
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import AdaBoostClassifier

### Train an AdaBoost ensemble of 100 weak learners and report accuracy
### on the held-out test set.
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(features_train, labels_train)
accuracy = clf.score(features_test, labels_test)
print(accuracy)

try:
prettyPicture(clf, features_test, labels_test)
Expand Down
8 changes: 3 additions & 5 deletions datasets_questions/explore_enron_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/python

"""
"""
Starter code for exploring the Enron dataset (emails + finances);
loads up the dataset (pickled dict of dicts).

Expand All @@ -12,11 +12,9 @@
but here's an example to get you started:

enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000

"""

import pickle

# Pickle data is a binary format: on Python 3, "rb" is REQUIRED here —
# opening in text mode "r" (this PR's change) makes pickle.load fail with
# UnicodeDecodeError/TypeError. A context manager also guarantees the file
# handle is closed instead of leaking from the bare open() call.
with open("../final_project/final_project_dataset.pkl", "rb") as data_file:
    enron_data = pickle.load(data_file)
20 changes: 14 additions & 6 deletions decision_tree/dt_author_id.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/python

"""
"""
This is the code to accompany the Lesson 3 (decision tree) mini-project.

Use a Decision Tree to identify emails from the Enron corpus by author:
Use a Decision Tree to identify emails from the Enron corpus by author:
Sara has label 0
Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
Expand All @@ -19,13 +19,21 @@
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

# Optional: subsample to 1% of the training set for faster iteration.
# BUG FIX (latent): the original used len(...)/100, which is a float in
# Python 3 and would raise "TypeError: slice indices must be integers"
# if re-enabled; integer division // is required.
#features_train = features_train[:len(features_train)//100]
#labels_train = labels_train[:len(labels_train)//100]


## training-data shape: (n_samples, n_features)
print(features_train.shape)

#########################################################
### your code goes here ###
from sklearn.tree import DecisionTreeClassifier

# A higher min_samples_split produces a coarser tree, which curbs
# overfitting on these high-dimensional email features.
clf = DecisionTreeClassifier(min_samples_split=40)

t0 = time()
clf.fit(features_train, labels_train)
print("time to train:", round(time()-t0, 3))

#########################################################

print(clf.score(features_test, labels_test))

#########################################################
Loading