-
Notifications
You must be signed in to change notification settings - Fork 3
/
main.py
114 lines (83 loc) · 3.01 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# SMS spam classifier: loads the SMS Spam Collection dataset, trains an
# MLP on TF-IDF features, and either evaluates on a held-out split or
# classifies a message passed on the command line.
print ("Importing Libraries...")
#Import libraries
import sys
import string
import nltk
# Fetch the NLTK stopwords corpus (no-op if already downloaded).
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
#Import libraries for performing NLP
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
#Display message
print ("All libraries imported...")
#Display message
print ("Reading the dataset...")
# Dataset is tab-separated with no header: <label>\t<message text>.
messages = pd.read_csv('dataset/SMSSpamCollection', sep='\t', names=['Label', 'Message'])
#Display message
print ("Dataset read successfully...")
#Text Processing
#Tokenization
#No stemming
def processText(message):
    """
    Tokenize an input message string.

    1. Remove punctuation characters
    2. Remove English stopwords (matched case-insensitively)
    3. Return the list of remaining word tokens

    Parameters
    ----------
    message : str
        Raw SMS text (assumed to be a plain string — TODO confirm the
        CountVectorizer always passes str, not bytes).

    Returns
    -------
    list of str
        Clean word tokens with original casing preserved.
    """
    # str.translate strips all punctuation in one C-level pass instead of
    # rebuilding the string character by character.
    noPunctuation = message.translate(str.maketrans('', '', string.punctuation))
    # Build the stopword set ONCE per call. The original called
    # stopwords.words('english') inside the comprehension, re-fetching the
    # whole corpus list for every single word (O(words * stopwords)).
    stopSet = set(stopwords.words('english'))
    return [word for word in noPunctuation.split() if word.lower() not in stopSet]
# Split dataset into train (80%) and held-out test (20%) sets.
from sklearn.model_selection import train_test_split
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['Message'], messages['Label'], test_size=0.2)
# NOTE: typo "pipeling" in the original status message fixed.
print ("Started creating pipeline...")
#TRAINING
#Create Pipeline
#Both training and testing data will pass through our created pipeline sequentially
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=processText)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),                   # integer counts to weighted TF-IDF scores
    ('classifier', MLPClassifier())                  # train on TF-IDF vectors with MLP classifier
])
#Display message
print ("Pipeline created successfully...")
#Display message
print ("Fitting the pipeline...")
# Fit the whole pipeline (vectorizer + TF-IDF + MLP) on the training split.
pipeline.fit(msg_train, label_train)
#Display message
print ("Pipeline fitted successfully...")
#Display message
print ("Predicting the results...")
argumentsList = sys.argv
if len(argumentsList) == 1:
    # TESTING: no message supplied on the command line, so evaluate the
    # fitted pipeline on the held-out test split.
    predictions = pipeline.predict(msg_test)
    #Display message
    print ("Results predicted...")
    #Display message
    print ("Displaying results...")
    print ("Confusion Matrix :")
    #PRINTING RESULTS
    print(confusion_matrix(label_test, predictions))
else:
    # A message was supplied as CLI arguments: join the words back into a
    # single string and classify it. (Use argumentsList consistently
    # instead of re-reading sys.argv.)
    message = " ".join(argumentsList[1:])
    # predict() expects an iterable of documents, so wrap in a Series.
    prediction = pipeline.predict(pd.Series(message))
    #Display message
    print ("Results predicted...")
    #Display message
    print ("Displaying results...\n")
    # prediction is an array of one label; print the label itself rather
    # than the raw numpy array representation (e.g. "ham", not "['ham']").
    print ("Input Message is a", prediction[0])