# DataPreprocess.py
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
train = pd.read_csv('Kaggle/train.csv')
test = pd.read_csv('Kaggle/test.csv')
train_data = train.copy()
test_data = test.copy()
train_data = train_data.set_index('id', drop=True)
print("Initial shape train:", train_data.shape)
print("Initial shape test:", test_data.shape)
# fill missing values of author, title
train_data[['title', 'author']] = train_data[['title', 'author']].fillna(value='Missing')
# drop the remaining rows that still have missing values (i.e. missing text), so every row is complete
train_data = train_data.dropna()
# Make a new column with the character length of each article's text
train_data['length'] = train_data['text'].apply(lambda text: len(str(text)))
print("Length stats before filtering (min, max, mean):", min(train_data['length']), max(train_data['length']),
      round(sum(train_data['length']) / len(train_data['length'])))
# Drop rows whose text is shorter than 50 characters
train_data = train_data.drop(train_data['text'][train_data['length'] < 50].index, axis=0)
# Fraction of capital letters in each title;
# titles with an unusually high share of capitals can be a clickbait signal
capital_letters = [sum(1 for letter in title if letter.isupper()) / len(title) for title in train_data['title']]
train_data['capital letters'] = capital_letters
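# For illustration (hypothetical title, not from the dataset): 'BREAKING News' has
# 9 uppercase letters out of 13 characters, so its ratio is 9 / 13 ≈ 0.69.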
# dict with author as key and the list of labels of that author's articles as value
# a drawback: some entries contain multiple authors and we haven't separated them
# also, many of the articles with a 'Missing' author turn out to be fake news
# d_author_label = {}
# for i in train_data['author']:
#     d_author_label[i] = [train_data['label'][j] for j in train_data[train_data['author'] == i].index]
# # print(d_author_label)
# calculating the percentage of fake news per author and storing it in a new dict
# fake_percentage = []
# d_authors_percent = {}
# sum_of_articles_per_author = 0
#
# for key in d_author_label.keys():
#     fake_articles_per_author = 0
#     # get the list of labels for this author
#     list_of_values = d_author_label.get(key)
#     sum_of_articles_per_author = len(list_of_values)
#
#     for j in range(len(list_of_values)):
#         # if the article is fake news
#         if list_of_values[j] == 1:
#             fake_articles_per_author = fake_articles_per_author + 1
#     fake_percentage = (float(fake_articles_per_author) / float(sum_of_articles_per_author)) * 100
#     d_authors_percent.update([(key, fake_percentage)])
train_data = train_data.fillna('')  # fillna returns a new DataFrame, so assign the result back
print("Length stats after filtering (min, max, mean):", min(train_data['length']), max(train_data['length']),
      round(sum(train_data['length']) / len(train_data['length'])))
max_features = 4500  # from the calculation above
# Tokenize the text - convert the words into integer ids.
# We don't need to strip punctuation ourselves: the Tokenizer's filters argument handles it.
# lower=True converts everything to lowercase.
# Only the 4500 most frequent words are kept when texts are converted to sequences.
tokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
tokenizer.fit_on_texts(texts=train_data['text'])
word_index = tokenizer.word_index
vocab_size = len(word_index)
print("vocab size", vocab_size)
X = tokenizer.texts_to_sequences(texts=train_data['text'])
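# Illustrative example (toy corpus, not part of the pipeline):
#   demo = Tokenizer(num_words=10)
#   demo.fit_on_texts(['Fake news spreads fast', 'Real news is slower'])
#   demo.texts_to_sequences(['Fake news'])  # -> [[2, 1]]  ('news' is most frequent, so it gets id 1)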
# Pad the sequences so they are all the same length.
# pad_sequences transforms a list of num_samples sequences (lists of integers) into a 2D NumPy
# array of shape (num_samples, num_timesteps) - in our case 20554 x 4500.
# num_timesteps is the maxlen argument if provided, otherwise the length of the longest sequence.
# Note: the same max_features value caps the vocabulary above and sets the padded length here.
X = pad_sequences(sequences=X, maxlen=max_features, padding='pre')
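# For illustration: with maxlen=4, pad_sequences([[5, 3]]) returns [[0, 0, 5, 3]]
# (zeros are added on the left because padding='pre'); sequences longer than maxlen
# are truncated from the front by default.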
embeddings_index = {}
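# Expected file format (standard GloVe text format): each line holds a token followed by
# 100 space-separated float components, e.g. "word 0.123 -0.456 ... 0.789".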
with open('Kaggle/glove.6B.100d.txt') as f:
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
print("words in word_index:", len(word_index))
print("Found %s word vectors!" % len(embeddings_index))
embeddings_matrix = np.zeros((vocab_size + 1, 100))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embeddings_matrix[i] = embedding_vector
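# Row 0 of embeddings_matrix stays all zeros: the Tokenizer's word_index starts at 1,
# so index 0 only appears as padding (hence the vocab_size + 1 rows).
# Words without a GloVe vector also keep an all-zero row.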
# len of y is 20554
y = train_data['label'].values # target
# Preparing test dataset
test_data = test_data.set_index('id', drop=True)
test_data = test_data.fillna(' ') # fill missing values
print(test_data.shape)
# Reuse the tokenizer fitted on the training text; re-fitting it on the test set would
# change the word index and make it inconsistent with the embedding matrix built above.
test_text = tokenizer.texts_to_sequences(texts=test_data['text'])
test_text = pad_sequences(sequences=test_text, maxlen=max_features, padding='pre')
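# Note: ndarray.tofile with the default sep='' writes raw binary despite the .csv name;
# np.savetxt("Kaggle/testtext.csv", test_text, fmt="%d", delimiter=",") would give a readable
# CSV if that is what downstream code expects (assumption about intent).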
test_text.tofile("Kaggle/testtext.csv")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
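# Illustrative sketch only (assumption: the actual model is defined elsewhere, not in this
# preprocessing script). The arrays built above could feed a Keras model roughly like:
#   from tensorflow.keras.models import Sequential
#   from tensorflow.keras.layers import Embedding, LSTM, Dense
#   from tensorflow.keras.initializers import Constant
#   model = Sequential([
#       Embedding(input_dim=vocab_size + 1, output_dim=100,
#                 embeddings_initializer=Constant(embeddings_matrix), trainable=False),
#       LSTM(64),
#       Dense(1, activation='sigmoid'),
#   ])
#   model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#   model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)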