-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmeansThreshold.py
93 lines (73 loc) · 3.37 KB
/
kmeansThreshold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import math
import operator
import time
from Instance import Instance
from readCsv import readCsv
def calculateThreshold(instances, minArg, maxArg):
    """Compute the centroid of ``instances`` and the decision threshold.

    The centroid is an ``Instance`` built as
    ``Instance(-1, [], instances[0].classification)`` whose ``params`` are
    then filled with the per-position mean of the instances' ``params``.
    The threshold is the mean of
    ``centroid.euclideanDistance(instance, minArg, maxArg)`` over all
    instances.

    Args:
        instances: Non-empty sequence of ``Instance`` objects.
        minArg: Forwarded unchanged to ``euclideanDistance`` (presumably the
            per-feature minimums used for normalisation -- confirm against
            ``Instance``).
        maxArg: Forwarded unchanged to ``euclideanDistance`` (presumably the
            per-feature maximums -- confirm against ``Instance``).

    Returns:
        A ``(threshold, centroid)`` tuple.

    Raises:
        ValueError: If ``instances`` is empty.
    """
    if len(instances) == 0:
        # Fail with a clear message instead of an opaque IndexError on
        # instances[0] (or a division by zero further down).
        raise ValueError("calculateThreshold requires at least one instance")

    count = len(instances)
    centroid = Instance(-1, [], instances[0].classification)

    # Accumulate the per-position sums directly in centroid.params; the list
    # grows the first time a position is seen.
    for instance in instances:
        for position, value in enumerate(instance.params):
            if position < len(centroid.params):
                centroid.params[position] += value
            else:
                centroid.params.append(value)

    # Turn the sums into means. The divisor is always the number of
    # instances, even if some instance had fewer params than others.
    for position, total in enumerate(centroid.params):
        centroid.params[position] = total / count

    # The threshold is the average distance from the centroid to the data.
    threshold = 0
    for instance in instances:
        threshold += centroid.euclideanDistance(instance, minArg, maxArg)

    return threshold / count, centroid
def knnClassif(classes, centroid, minArg, maxArg, instance, k, threshold):
    """Classify ``instance`` based on its distance to ``centroid``.

    The distance is obtained from
    ``instance.euclideanDistance(centroid, minArg, maxArg)``, recorded on
    the instance through ``instance.insertDistance(distance, centroid, k)``,
    and the label is whatever ``instance.classify(k, classes, threshold)``
    returns.

    Args:
        classes: Passed through to ``instance.classify``.
        centroid: Reference object the distance is measured against.
        minArg: Forwarded unchanged to ``euclideanDistance``.
        maxArg: Forwarded unchanged to ``euclideanDistance``.
        instance: Object being classified; it is mutated by
            ``insertDistance``.
        k: Passed to both ``insertDistance`` and ``classify``.
        threshold: Passed through to ``instance.classify``.

    Returns:
        The value returned by ``instance.classify``.
    """
    instance.insertDistance(
        instance.euclideanDistance(centroid, minArg, maxArg),
        centroid,
        k,
    )
    predictedLabel = instance.classify(k, classes, threshold)
    return predictedLabel
def measureAccuracy(classes, centroid, tests, minArg, maxArg, k, threshold):
    """Classify every test instance and print recall, precision and F1.

    Each element of ``tests`` is classified with ``knnClassif`` and compared
    against its own ``classification`` attribute to build a confusion
    matrix. Note the label convention used here: a prediction equal to the
    string ``"true"`` is tallied as a *negative*, any other prediction as a
    *positive*.

    A metric whose denominator is zero (for example when ``tests`` is empty
    or nothing was predicted positive) is reported as 0 instead of raising
    ``ZeroDivisionError``.

    Args:
        classes: Passed through to ``knnClassif``.
        centroid: Passed through to ``knnClassif``.
        tests: Iterable of instances exposing a ``classification`` attribute.
        minArg: Passed through to ``knnClassif``.
        maxArg: Passed through to ``knnClassif``.
        k: Passed through to ``knnClassif``.
        threshold: Passed through to ``knnClassif``.

    Returns:
        None. The three metrics are printed as percentages.
    """
    trueNegative = 0   # correct predictions labelled "true"
    truePositive = 0   # correct predictions with any other label
    falseNegative = 0  # wrong predictions labelled "true"
    falsePositive = 0  # wrong predictions with any other label

    # Classify each test instance and update the confusion matrix.
    for testInstance in tests:
        predicted = knnClassif(classes, centroid, minArg, maxArg, testInstance, k, threshold)
        predictedTrue = predicted == "true"
        if predicted == testInstance.classification:
            if predictedTrue:
                trueNegative += 1
            else:
                truePositive += 1
        elif predictedTrue:
            falseNegative += 1
        else:
            falsePositive += 1

    # Compute the metrics, guarding every division against an empty
    # denominator.
    predictedPositives = truePositive + falsePositive
    actualPositives = truePositive + falseNegative
    f1Denominator = (2 * truePositive) + falsePositive + falseNegative

    precision = truePositive / predictedPositives if predictedPositives else 0.0
    recall = truePositive / actualPositives if actualPositives else 0.0
    f_measure = (2 * truePositive) / f1Denominator if f1Denominator else 0.0

    print("Recall: %.2f%%" % (recall * 100))
    print("Precision: %.2f%%" % (precision * 100))
    print("F1-measure: %.2f%%\n" % (f_measure * 100))
def kmeansThreshold():
    """Run the centroid/threshold evaluation on each bundled dataset pair.

    For every (training file, test file) pair this reads both CSV files
    with ``readCsv``, derives the centroid and threshold from the training
    instances via ``calculateThreshold``, prints the metrics through
    ``measureAccuracy`` (with ``k`` fixed at 1), and finally prints the
    elapsed wall-clock time followed by a separator line.
    """
    # Each training file is paired with the test file evaluated against it.
    datasetPairs = [
        (
            "./oneClassDatasets/JM1_software_defect_prediction.csv",
            "./testDatasets/testJM1_software_defect_prediction.csv",
        ),
        (
            "./oneClassDatasets/PC1_software_defect_prediction.csv",
            "./testDatasets/testPC1_software_defect_prediction.csv",
        ),
    ]

    for trainPath, testPath in datasetPairs:
        # The third "/"-separated component of the path is the file name.
        print("\nDataset: " + trainPath.split("/")[2] + "\n")

        startTime = time.time()
        classes, instances, minArg, maxArg = readCsv(trainPath)
        # Only the instances of the test file are used; min/max come from
        # the training data.
        _, testInstances, _, _ = readCsv(testPath)

        threshold, centroid = calculateThreshold(instances, minArg, maxArg)
        measureAccuracy(classes, centroid, testInstances, minArg, maxArg, 1, threshold)

        print("Tempo gasto: " + str(time.time() - startTime))
        print("------------------------------------------------------------------------------------")
# Script entry point: the evaluation runs as soon as this module is executed
# (note that it also runs if the module is imported).
kmeansThreshold()