#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module for E-mail Summarization
*****************************************************************************
Input Parameters:
emails: A list of strings containing the emails
Returns:
summary: A list of strings containing the summaries.
*****************************************************************************
"""
# ***************************************************************************
import numpy as np
from talon.signature.bruteforce import extract_signature
from nltk.tokenize import sent_tokenize
import skipthoughts
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
# ***************************************************************************
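# NOTE: `skipthoughts` is assumed to be the encoder module from the
# skip-thoughts project (ryankiros/skip-thoughts); its pre-trained model
# files must be downloaded separately. `sent_tokenize` requires the NLTK
# 'punkt' tokenizer data (nltk.download('punkt')).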
def preprocess(emails):
    """
    Performs in-place preprocessing operations such as:
    1. Removing signature lines (only English emails are supported)
    2. Removing new line characters
    """
    for i, email in enumerate(emails):
        # Strip the signature block detected by talon
        email, _ = extract_signature(email)
        # Collapse the body into a single line, dropping blank lines
        lines = [line.strip() for line in email.split('\n') if line.strip()]
        emails[i] = ' '.join(lines)

def split_sentences(emails):
    """
    Splits the emails into individual sentences, in place
    """
    for i, email in enumerate(emails):
        # Keep only sentences that are non-empty after stripping whitespace
        emails[i] = [sent.strip() for sent in sent_tokenize(email)
                     if sent.strip()]

def skipthought_encode(emails):
    """
    Obtains sentence embeddings for each sentence in the emails
    """
    enc_emails = [None] * len(emails)
    # Cumulative sentence counts, used to map the flat array of encoded
    # sentences back to the email each sentence came from
    cum_sum_sentences = [0]
    sent_count = 0
    for email in emails:
        sent_count += len(email)
        cum_sum_sentences.append(sent_count)
    all_sentences = [sent for email in emails for sent in email]
    print('Loading pre-trained models...')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    print('Encoding sentences...')
    enc_sentences = encoder.encode(all_sentences, verbose=False)
    # Slice the flat encoding array back into one matrix per email
    for i in range(len(emails)):
        begin = cum_sum_sentences[i]
        end = cum_sum_sentences[i + 1]
        enc_emails[i] = enc_sentences[begin:end]
    return enc_emails

def summarize(emails):
    """
    Performs summarization of emails
    """
    n_emails = len(emails)
    summary = [None] * n_emails
    print('Preprocessing...')
    preprocess(emails)
    print('Splitting into sentences...')
    split_sentences(emails)
    print('Starting to encode...')
    enc_emails = skipthought_encode(emails)
    print('Encoding Finished')
    for i in range(n_emails):
        enc_email = enc_emails[i]
        if len(enc_email) == 0:
            # Nothing survived preprocessing; emit an empty summary
            summary[i] = ''
            continue
        # Use roughly sqrt(n) clusters so the summary grows sub-linearly
        # with the number of sentences in the email
        n_clusters = int(np.ceil(len(enc_email) ** 0.5))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans = kmeans.fit(enc_email)
        # Record the mean position of each cluster's sentences so the
        # summary can preserve the original sentence order
        avg = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        # Pick the sentence closest to each centroid as its representative;
        # this only needs to be computed once per email, not per cluster
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                   enc_email)
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        summary[i] = ' '.join([emails[i][closest[idx]] for idx in ordering])
    print('Clustering Finished')
    return summary
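
# ***************************************************************************
# Minimal usage sketch. The sample email below is illustrative only, and
# running this assumes the pre-trained skip-thoughts model files and the
# NLTK 'punkt' data are already installed.
if __name__ == '__main__':
    sample_emails = [
        "Hi team, the quarterly report is due on Friday. Please send me "
        "your sections by Thursday noon. Flag any blockers early so we "
        "can adjust the schedule. Thanks, Alex"
    ]
    for s in summarize(sample_emails):
        print(s)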