-
Notifications
You must be signed in to change notification settings - Fork 191
/
apriori.py
150 lines (128 loc) · 5.1 KB
/
apriori.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from datetime import datetime
class Apriori():
def __init__(self, dataset):
self.dataset = dataset
self.support_data = {}
self.freq_itemsets = []
self.t_num = float(len(self.dataset))
def __create_C1(self):
"""
Create frequent candidate 1-itemset C1 by scaning data set.
Args:
data_set: A list of transactions. Each transaction contains several items.
Returns:
C1: A set which contains all frequent candidate 1-itemsets
"""
C1 = set()
for t in self.dataset:
for item in t:
item_set = frozenset([item])
C1.add(item_set)
return C1
def __is_apriori(self, Ck_item, Lksub1):
"""
Judge whether a frequent candidate k-itemset satisfy Apriori property.
Args:
Ck_item: a frequent candidate k-itemset in Ck which contains all frequent
candidate k-itemsets.
Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets.
Returns:
True: satisfying Apriori property.
False: Not satisfying Apriori property.
"""
for item in Ck_item:
sub_Ck = Ck_item - frozenset([item])
if sub_Ck not in Lksub1:
return False
return True
def __create_Ck(self, Lksub1, k):
"""
Create Ck, a set which contains all all frequent candidate k-itemsets
by Lk-1's own connection operation.
Args:
Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets.
k: the item number of a frequent itemset.
Return:
Ck: a set which contains all all frequent candidate k-itemsets.
"""
Ck = set()
len_Lksub1 = len(Lksub1)
list_Lksub1 = list(Lksub1)
for i in range(len_Lksub1):
for j in range(1, len_Lksub1):
l1 = list(list_Lksub1[i])
l2 = list(list_Lksub1[j])
l1.sort()
l2.sort()
if l1[0:k-2] == l2[0:k-2]:
Ck_item = list_Lksub1[i] | list_Lksub1[j]
# pruning
if self.__is_apriori(Ck_item, Lksub1):
Ck.add(Ck_item)
return Ck
def __generate_Lk_by_Ck(self, Ck, min_sup):
"""
Generate Lk by executing a delete policy from Ck.
Args:
data_set: A list of transactions. Each transaction con tains several items.
Ck: A set which contains all all frequent candidate k-itemsets.
min_sup: The minimum support.
support_data: A dictionary. The key is frequent itemset and the value is support.
Returns:
Lk: A set which contains all all frequent k-itemsets.
"""
Lk = set()
item_count = {}
for t in self.dataset:
for item in Ck:
if item.issubset(t):
if item not in item_count:
item_count[item] = 1
else:
item_count[item] += 1
for item in item_count:
if (item_count[item] / self.t_num) >= min_sup:
Lk.add(item)
self.support_data[item] = item_count[item] / self.t_num
return Lk
def generate_L(self, min_sup):
"""
Generate all frequent itemsets.
Args:
data_set: A list of transactions. Each transaction contains several items.
k: Maximum number of items for all frequent itemsets.
min_sup: The minimum support.
Returns:
L: The list of Lk.
support_data: A dictionary. The key is frequent itemset and the value is support.
"""
start = datetime.now()
C1 = self.__create_C1()
deltatime = datetime.now() - start
create_Ck_time = deltatime.seconds + deltatime.microseconds / 1000000
start = datetime.now()
L1 = self.__generate_Lk_by_Ck(C1, min_sup)
deltatime = datetime.now() - start
generate_Lk_time = deltatime.seconds + deltatime.microseconds / 1000000
Lksub1 = L1.copy()
for lk_i in Lksub1:
self.freq_itemsets.append((lk_i, self.support_data[lk_i]))
i = 2
while True:
start = datetime.now()
Ci = self.__create_Ck(Lksub1, i)
deltatime = datetime.now() - start
create_Ck_time += deltatime.seconds + deltatime.microseconds / 1000000
start = datetime.now()
Li = self.__generate_Lk_by_Ck(Ci, min_sup)
deltatime = datetime.now() - start
generate_Lk_time += deltatime.seconds + deltatime.microseconds / 1000000
Lksub1 = Li.copy()
if len(Lksub1) == 0:
break
for lk_i in Lksub1:
self.freq_itemsets.append((lk_i, self.support_data[lk_i]))
i += 1
print("Create Ck time (s): ", create_Ck_time)
print("Generate Lk time (s): ", generate_Lk_time)
return self.freq_itemsets