diff --git a/keras_retinanet/bin/evaluate.py b/keras_retinanet/bin/evaluate.py
index 53695aea9..ab08c57c9 100755
--- a/keras_retinanet/bin/evaluate.py
+++ b/keras_retinanet/bin/evaluate.py
@@ -154,7 +154,7 @@ def main(args=None):
         from ..utils.coco_eval import evaluate_coco
         evaluate_coco(generator, model, args.score_threshold)
     else:
-        average_precisions, inference_time = evaluate(
+        average_precisions, inference_time, f1_scores = evaluate(
             generator,
             model,
             iou_threshold=args.iou_threshold,
@@ -166,11 +166,19 @@ def main(args=None):
         # print evaluation
         total_instances = []
         precisions = []
+        scores = []
+
         for label, (average_precision, num_annotations) in average_precisions.items():
             print('{:.0f} instances of class'.format(num_annotations),
                   generator.label_to_name(label), 'with average precision: {:.4f}'.format(average_precision))
             total_instances.append(num_annotations)
             precisions.append(average_precision)
+
+        for label, (f1_score, num_annotations) in f1_scores.items():
+            print('{:.0f} instances of class'.format(num_annotations),
+                  generator.label_to_name(label), 'with F1 score: {:.4f}'.format(f1_score))
+            # total_instances was already filled in the average precision loop above
+            scores.append(f1_score)
 
         if sum(total_instances) == 0:
             print('No test instances found.')
@@ -180,6 +188,10 @@ def main(args=None):
             print('mAP using the weighted average of precisions among classes: {:.4f}'.format(sum([a * b for a, b in zip(total_instances, precisions)]) / sum(total_instances)))
         print('mAP: {:.4f}'.format(sum(precisions) / sum(x > 0 for x in total_instances)))
+
+        print('mF1 using the weighted average of F1 scores among classes: {:.4f}'.format(sum([a * b for a, b in zip(total_instances, scores)]) / sum(total_instances)))
+        print('mF1: {:.4f}'.format(sum(scores) / sum(x > 0 for x in total_instances)))
+
 
 
 if __name__ == '__main__':
diff --git a/keras_retinanet/callbacks/eval.py b/keras_retinanet/callbacks/eval.py
index abdc8bbc0..d442bc13b 100644
--- a/keras_retinanet/callbacks/eval.py
+++ b/keras_retinanet/callbacks/eval.py
@@ -60,7 +60,7 @@ def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
 
         # run evaluation
-        average_precisions, _ = evaluate(
+        average_precisions, _, f1_scores = evaluate(
             self.generator,
             self.model,
             iou_threshold=self.iou_threshold,
@@ -72,6 +72,8 @@ def on_epoch_end(self, epoch, logs=None):
         # compute per class average precision
         total_instances = []
         precisions = []
+        scores = []
+
         for label, (average_precision, num_annotations) in average_precisions.items():
             if self.verbose == 1:
                 print('{:.0f} instances of class'.format(num_annotations),
@@ -83,16 +85,34 @@ def on_epoch_end(self, epoch, logs=None):
         else:
             self.mean_ap = sum(precisions) / sum(x > 0 for x in total_instances)
 
+        # compute per class F1 score
+        for label, (f1_score, num_annotations) in f1_scores.items():
+            if self.verbose == 1:
+                print('{:.0f} instances of class'.format(num_annotations),
+                      self.generator.label_to_name(label), 'with F1 score: {:.4f}'.format(f1_score))
+            # total_instances was already filled in the average precision loop above
+            scores.append(f1_score)
+
+        if self.weighted_average:
+            self.mean_f1_score = sum([a * b for a, b in zip(total_instances, scores)]) / sum(total_instances)
+        else:
+            self.mean_f1_score = sum(scores) / sum(x > 0 for x in total_instances)
+
         if self.tensorboard:
             import tensorflow as tf
             if tf.version.VERSION < '2.0.0' and self.tensorboard.writer:
                 summary = tf.Summary()
-                summary_value = summary.value.add()
-                summary_value.simple_value = self.mean_ap
-                summary_value.tag = "mAP"
+                summary_value_map = summary.value.add()
+                summary_value_map.simple_value = self.mean_ap
+                summary_value_map.tag = "mAP"
+                summary_value_f1 = summary.value.add()
+                summary_value_f1.simple_value = self.mean_f1_score
+                summary_value_f1.tag = "mF1"
                 self.tensorboard.writer.add_summary(summary, epoch)
 
         logs['mAP'] = self.mean_ap
+        logs['mF1'] = self.mean_f1_score
 
         if self.verbose == 1:
-            print('mAP: {:.4f}'.format(self.mean_ap))
+            print('mAP: {:.4f}\nmF1: {:.4f}'.format(
+                self.mean_ap, self.mean_f1_score))
diff --git a/keras_retinanet/utils/eval.py b/keras_retinanet/utils/eval.py
index da411b0d1..201e56c12 100644
--- a/keras_retinanet/utils/eval.py
+++ b/keras_retinanet/utils/eval.py
@@ -56,6 +56,28 @@ def _compute_ap(recall, precision):
     return ap
 
 
+def _compute_f1(recall, precision):
+    """
+    # Arguments
+        recall:    The recall curve (list).
+        precision: The precision curve (list).
+    # Returns
+        The F1 score
+    """
+
+    # One option is to look for the points on the PR curve where the
+    # X axis (recall) and Y axis (precision) take their highest, identical values:
+    # i = np.where((precision == precision.max()) & (recall == recall.max()) & (recall == precision))[0]
+
+    # then multiply precision and recall, divide by their sum
+    # and multiply by two:
+    # f1 = np.sum(2 * (precision[i] * recall[i]) / (precision[i] + recall[i]), axis=0)
+
+    # Here the maxima of the two curves are used directly instead.
+    f1 = 2 * ((precision.max() * recall.max()) / (precision.max() + recall.max()))
+    return f1
+
+
 def _get_detections(generator, model, score_threshold=0.05, max_detections=100, save_path=None):
     """ Get the detections from the model using the generator.
@@ -174,6 +196,7 @@ def evaluate(
     all_detections, all_inferences = _get_detections(generator, model, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path)
     all_annotations = _get_annotations(generator)
     average_precisions = {}
+    f1_scores = {}
 
     # all_detections = pickle.load(open('all_detections.pkl', 'rb'))
     # all_annotations = pickle.load(open('all_annotations.pkl', 'rb'))
@@ -237,8 +260,12 @@ def evaluate(
         # compute average precision
         average_precision = _compute_ap(recall, precision)
         average_precisions[label] = average_precision, num_annotations
+
+        # compute F1 score
+        f1_score = _compute_f1(recall, precision)
+        f1_scores[label] = f1_score, num_annotations
 
     # inference time
     inference_time = np.sum(all_inferences) / generator.size()
 
-    return average_precisions, inference_time
+    return average_precisions, inference_time, f1_scores
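
For context, the snippet below is a minimal, standalone sketch (not part of the patch) of the F1 computation added in keras_retinanet/utils/eval.py and of the new three-value return of evaluate(); the precision/recall arrays are synthetic examples, not repository data.

# Sketch only: reproduces the patched _compute_f1 on synthetic curves.
import numpy as np

def _compute_f1(recall, precision):
    # F1 from the maxima of the recall and precision curves, as in the patch.
    return 2 * ((precision.max() * recall.max()) / (precision.max() + recall.max()))

recall    = np.array([0.2, 0.5, 0.8])   # synthetic recall curve
precision = np.array([0.9, 0.7, 0.6])   # synthetic precision curve
print('F1: {:.4f}'.format(_compute_f1(recall, precision)))  # 2*0.9*0.8/(0.9+0.8) ~ 0.8471

# With the patch applied, callers unpack three values, e.g.:
# average_precisions, inference_time, f1_scores = evaluate(generator, model)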