Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Udacity ud120 #299

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
enron_mail_20110402.tgz
enron_mail_20110402/
enron_mail_20150507.tgz
enron_mail_20150507.tar.gz
enron_mail_20150507.tar
maildir/
text_learning/your_word_data.pkl
text_learning/your_email_authors.pkl
my_classifier.pkl
my_dataset.pkl
my_feature_list.pkl
.idea

Binary file added Project report.docx
Binary file not shown.
34 changes: 32 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,34 @@
ud120-projects
<h1> ud120-projects </h1>
==============

Starter project code for students taking Udacity ud120
My repo for Udacity ud120 course

<h2> Content </h2>
* Session exercises / mini projects
* Enron project


<h2> IDE </h2>
PyCharm Community Edition by JetBrains

<h2> Commands used </h2>
**install sklearn**

pip install scikit-learn

**install natural language toolkit**

pip install nltk

**install matplotlib**

pip install matplotlib

<h2> Environment from requirements.txt</h2>

nltk==3.2.1<br>
numpy==1.13.3<br>
scikit-learn==0.18<br>
scipy==0.19.1<br>


2 changes: 1 addition & 1 deletion choose_your_own/class_vis.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,5 @@ def output_image(name, format, bytes):
data['name'] = name
data['format'] = format
data['bytes'] = base64.encodestring(bytes)
print image_start+json.dumps(data)+image_end
print( image_start+json.dumps(data)+image_end)

Binary file added choose_your_own/test.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
11 changes: 10 additions & 1 deletion choose_your_own/your_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,16 @@

### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

# KNN: fit on the training split and score the predictions on the test split.
clf_knn = KNeighborsClassifier(n_neighbors=4)
clf_knn.fit(features_train, labels_train)
pred_knn = clf_knn.predict(features_test)
print("Accuracy for KNeighborsClassifier:", accuracy_score(labels_test, pred_knn))

# Random forest: same procedure. The predictions get their own name so that
# clf_rf keeps referring to the fitted classifier instead of being replaced
# by the prediction array.
clf_rf = RandomForestClassifier(n_estimators=15, min_samples_split=6)
clf_rf.fit(features_train, labels_train)
pred_rf = clf_rf.predict(features_test)
print("Accuracy RandomForestClassifier:", accuracy_score(labels_test, pred_rf))



Expand Down
43 changes: 42 additions & 1 deletion datasets_questions/explore_enron_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,48 @@
"""

import pickle
import numpy as np

# NOTE: pickle can execute arbitrary code while loading; only open dataset
# files that come from a trusted source.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = pickle.load(dataset_file)

# Number of people in the dataset, then number of features stored per person.
print(len(enron_data))
print(len(list(enron_data.values())[0]))

# Number of people flagged as a person of interest (POI) in the dataset.
count = sum(1 for person in enron_data.values() if person["poi"] == 1)
print(count)

# Number of lines in the names file that carry a "(y)" or "(n)" marker.
# Each marker needs its own `in` test: a bare non-empty string on the left of
# `or` is always truthy and would make every line count.
total_poi = 0
with open('../final_project/poi_names.txt', 'r') as names_file:
    for line in names_file:
        if "(y)" in line or "(n)" in line:
            total_poi += 1
print(total_poi)

print("Net Stock value of James Prentice: ", enron_data['PRENTICE JAMES']['total_stock_value'])
print("Wesley Colwell to POI emails: ", enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
print("Stock options of Jeffrey Skilling: ", enron_data['SKILLING JEFFREY K']['exercised_stock_options'])

# Largest total_payments value among the three named people.
most_value_taken = max(
    enron_data[person_name]['total_payments']
    for person_name in ("LAY KENNETH L", "SKILLING JEFFREY K", "FASTOW ANDREW S")
)
print(most_value_taken)

salaries_not_nan = 0
known_emails = 0
total_payments_nan = 0
total_payments_nan_poi = 0
for person in enron_data.values():
    # float() turns the 'NaN' placeholder string into a real NaN for np.isnan.
    if not np.isnan(float(person['salary'])):
        salaries_not_nan += 1
    if person['email_address'] != 'NaN':
        known_emails += 1
    if np.isnan(float(person['total_payments'])):
        total_payments_nan += 1
        # The POI flag is tested directly; np.isnan() of a boolean is always
        # False and would never let this counter increase.
        if person["poi"] == 1:
            total_payments_nan_poi += 1

# Guard the divisions so an empty dataset or a dataset without POIs reports
# 0.0 instead of raising ZeroDivisionError.
people_nan_percentage = total_payments_nan * 100.0 / len(enron_data) if enron_data else 0.0
poi_nan_percentage = total_payments_nan_poi * 100.0 / count if count else 0.0

print('Salaries available:: ', salaries_not_nan)
print('Available emails: ', known_emails)
print('Number Percentage people NaN -> their total payments: ', total_payments_nan, people_nan_percentage)
print('Number and Percentage Pois NaN -> their total payments: ', total_payments_nan_poi, poi_nan_percentage)
16 changes: 16 additions & 0 deletions decision_tree/dt_author_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,23 @@

#########################################################
### your code goes here ###
#imports
from sklearn import tree
from sklearn.metrics import accuracy_score
#
# Decision tree configured with min_samples_split=40, trained on the
# training features and labels.
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)

# Predict labels for the test features and report the accuracy.
pred = clf.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
print("Accuracy:", accuracy)

# Length of the first training row, reported as the number of features.
feature_count = len(features_train[0])
print("No of features in date:", feature_count)

#########################################################

Expand Down
60 changes: 60 additions & 0 deletions evaluation/evaluate_poi_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@

import pickle
import sys
import numpy as np
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cross_validation import train_test_split

data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )

Expand All @@ -27,5 +31,61 @@


### your code goes here
# Hold out 30% of the data for testing; the fixed random_state keeps the
# split reproducible between runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.30, random_state=42)

# Train a decision tree on the training split and predict the test split.
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("accuracy:", accuracy_score(labels_test, pred))

### evaluation
values, counts = np.unique(pred, return_counts=True)
test_size = len(features_test)

# Materialise the pairs so the (label, count) values are printed rather than
# the repr of a lazy zip object.
print("Predicted POIs:", list(zip(values, counts)))
print("Total number in test set:", test_size)

# Accuracy of a classifier that answers 0 for everyone: the share of *true*
# labels equal to 0. The predicted counts cannot be used for this, since they
# describe what the tree answered, not what the test set contains.
non_poi_in_test = sum(1 for label in labels_test if label == 0)
print("Accuracy - all poi=0:", float(non_poi_in_test) / test_size)

# Test points that are labelled 1 and were also predicted as 1.
true_positives = sum(
    1 for actual, predicted in zip(labels_test, pred)
    if actual == 1 and predicted == 1
)

print("TP - true positives:", true_positives)
print("Precision score:", precision_score(labels_test, pred))
print("Recall score:", recall_score(labels_test, pred))

prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]


def calc_precision_and_recall(actual, predicted):
    """Print and return precision and recall for binary 0/1 labels.

    Args:
        actual: Iterable of ground-truth labels; 1 marks a positive.
        predicted: Iterable of predicted labels, paired with ``actual``
            by position.

    Returns:
        A ``(precision, recall)`` tuple of floats. A ratio whose
        denominator is zero (no predicted positives, or no actual
        positives) is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    print("Doing precision and recall...")
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for a, p in zip(actual, predicted):
        if a == 1 and p == 1:
            true_positives += 1
        elif a == 1 and p == 0:
            false_negatives += 1
        elif a == 0 and p == 1:
            false_positives += 1
        # Every other pair is a true negative, which neither metric uses.

    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives
    # float() keeps the ratios fractional even under integer division.
    precision = float(true_positives) / predicted_positives if predicted_positives else 0.0
    recall = float(true_positives) / actual_positives if actual_positives else 0.0

    print("Precision:", precision)
    print("Recall:", recall)
    return precision, recall


calc_precision_and_recall(true_labels, prediction_labels)

17 changes: 16 additions & 1 deletion feature_selection/find_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# get words
words = vectorizer.get_feature_names()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
Expand All @@ -38,6 +40,19 @@


### your code goes here

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit a decision tree (min_samples_split=40) on the training data and report
# its accuracy on the test data.
clf = DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
test_accuracy = accuracy_score(labels_test, pred)
print("Accuracy:", test_accuracy)

# List every feature whose importance exceeds 0.2, together with the
# vocabulary word stored at the same index.
print("Important features:")
for position, importance in enumerate(clf.feature_importances_):
    if not importance > 0.2:
        continue
    print("Feature number", position)
    print("Importance", importance)
    print("Word", words[position])


Loading