Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

commit id ce48019 change 'rb' to 'r' #256

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ text_learning/your_email_authors.pkl
my_classifier.pkl
my_dataset.pkl
my_feature_list.pkl
.ipynb_checkpoints/enron_data_ml-checkpoint.ipynb
enron_mail_20150507.tar.gz
Binary file added choose_your_own/test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
52 changes: 52 additions & 0 deletions choose_your_own/your_algorithm KNN.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/python

import matplotlib.pyplot as plt
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture
import numpy as np

features_train, labels_train, features_test, labels_test = makeTerrainData()


### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [point[0] for point, label in zip(features_train, labels_train) if label == 0]
bumpy_fast = [point[1] for point, label in zip(features_train, labels_train) if label == 0]
grade_slow = [point[0] for point, label in zip(features_train, labels_train) if label == 1]
bumpy_slow = [point[1] for point, label in zip(features_train, labels_train) if label == 1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
# BUG FIX: both classes must use the same (x, y) = (bumpiness, grade) axis
# order. The original plotted "slow" as (grade, bumpy), mirroring those
# points across the diagonal relative to the "fast" class and the axis labels.
plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.neighbors import KNeighborsClassifier
### choose k = floor(sqrt(n_samples)), bumped up by one when even, so that
### k is always odd (an odd k avoids ties in KNN majority voting)
def kvalues(n_samples=None):
    """Return an odd neighbor count k for KNN.

    k is int(sqrt(n_samples)); if that value is even, k + 1 is returned
    instead so the result is always odd.

    n_samples -- sample count to base k on; defaults to len(features_train)
                 so the original zero-argument call keeps working.
    """
    if n_samples is None:
        n_samples = len(features_train)
    # Compute the square root once instead of up to three times.
    k = int(np.sqrt(n_samples))
    return k if k % 2 != 0 else k + 1

### Fit a KNN classifier using the odd-k heuristic above, then report its
### accuracy on the held-out test set.
clf = KNeighborsClassifier(n_neighbors=kvalues())
clf.fit(features_train, labels_train)
accuracy = clf.score(features_test, labels_test)
print(accuracy)


try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    # prettyPicture is optional; skip the plot when it isn't available.
    pass
43 changes: 43 additions & 0 deletions choose_your_own/your_algorithm Randomforest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/python

import matplotlib.pyplot as plt
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture
import numpy as np

features_train, labels_train, features_test, labels_test = makeTerrainData()


### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [point[0] for point, label in zip(features_train, labels_train) if label == 0]
bumpy_fast = [point[1] for point, label in zip(features_train, labels_train) if label == 0]
grade_slow = [point[0] for point, label in zip(features_train, labels_train) if label == 1]
bumpy_slow = [point[1] for point, label in zip(features_train, labels_train) if label == 1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
# BUG FIX: both classes must use the same (x, y) = (bumpiness, grade) axis
# order. The original plotted "slow" as (grade, bumpy), mirroring those
# points across the diagonal relative to the "fast" class and the axis labels.
plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################


### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import RandomForestClassifier

### Train a random forest (a larger min_samples_split yields a smoother
### decision boundary) and report accuracy on the held-out test set.
clf = RandomForestClassifier(min_samples_split=50)
clf.fit(features_train, labels_train)
accuracy = clf.score(features_test, labels_test)
print(accuracy)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    # prettyPicture is optional; skip the plot when it isn't available.
    pass
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import matplotlib.pyplot as plt
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture
import numpy as np

features_train, labels_train, features_test, labels_test = makeTerrainData()

Expand All @@ -28,15 +29,14 @@
################################################################################


### your code here! name your classifier object clf if you want the
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import AdaBoostClassifier

### Train an AdaBoost ensemble of 100 weak learners and report accuracy
### on the held-out test set.
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(features_train, labels_train)
accuracy = clf.score(features_test, labels_test)
print(accuracy)

try:
prettyPicture(clf, features_test, labels_test)
Expand Down
8 changes: 3 additions & 5 deletions datasets_questions/explore_enron_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/python

"""
"""
Starter code for exploring the Enron dataset (emails + finances);
loads up the dataset (pickled dict of dicts).

Expand All @@ -12,11 +12,9 @@
but here's an example to get you started:

enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000

"""

import pickle

# Pickle data is a binary format: on Python 3, "rb" is REQUIRED here —
# opening in text mode "r" (this PR's change) makes pickle.load fail with
# UnicodeDecodeError/TypeError. A context manager also guarantees the file
# handle is closed instead of leaking from the bare open() call.
with open("../final_project/final_project_dataset.pkl", "rb") as data_file:
    enron_data = pickle.load(data_file)
20 changes: 14 additions & 6 deletions decision_tree/dt_author_id.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/python

"""
"""
This is the code to accompany the Lesson 3 (decision tree) mini-project.

Use a Decision Tree to identify emails from the Enron corpus by author:
Use a Decision Tree to identify emails from the Enron corpus by author:
Sara has label 0
Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
Expand All @@ -19,13 +19,21 @@
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

# Optional: subsample to 1% of the training set for faster iteration.
# BUG FIX (latent): the original used len(...)/100, which is a float in
# Python 3 and would raise "TypeError: slice indices must be integers"
# if re-enabled; integer division // is required.
#features_train = features_train[:len(features_train)//100]
#labels_train = labels_train[:len(labels_train)//100]


## training-data shape: (n_samples, n_features)
print(features_train.shape)

#########################################################
### your code goes here ###
from sklearn.tree import DecisionTreeClassifier

# A higher min_samples_split produces a coarser tree, which curbs
# overfitting on these high-dimensional email features.
clf = DecisionTreeClassifier(min_samples_split=40)

t0 = time()
clf.fit(features_train, labels_train)
print("time to train:", round(time()-t0, 3))

#########################################################

print(clf.score(features_test, labels_test))

#########################################################
Loading