From 792d06108c58f3f38a9bb1328851b6d3b23de35e Mon Sep 17 00:00:00 2001
From: david
Date: Wed, 18 Jan 2012 00:38:51 +0000
Subject: [PATCH] Block comment on novelty detection.

---
 FPEngine.cc | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/FPEngine.cc b/FPEngine.cc
index e320a631d..6072eab8e 100644
--- a/FPEngine.cc
+++ b/FPEngine.cc
@@ -925,6 +925,28 @@ int label_prob_cmp(const void *a, const void *b) {
   return 0;
 }
 
+/* Return a measure of how much the given feature vector differs from the other
+   members of the class given by label.
+
+   This can be thought of as the distance from the given feature vector to the
+   mean of the class in multidimensional space, after scaling. Each dimension
+   is further scaled by the inverse of the sample variance of that feature.
+   This is an approximation of the Mahalanobis distance
+   (https://en.wikipedia.org/wiki/Mahalanobis_distance), which normally uses a
+   full covariance matrix of the features. If we take the features to be
+   pairwise independent (which they are not), then the covariance matrix is
+   just the diagonal matrix containing the per-feature variances, leading to
+   the same calculation as is done below. Using only the per-feature variances
+   rather than covariance matrices saves space; it requires only n entries per
+   class rather than n^2, where n is the length of a feature vector.
+
+   It happens often that a feature's variance is undefined (because there is
+   only one example in the class) or zero (because there are two identical
+   values for that feature). Both these cases are mapped to zero by train.py,
+   and we handle them the same way: by using a small default variance. This
+   will tend to make small differences count a lot (because we probably want
+   this fingerprint in order to expand the class), while still allowing
+   near-perfect matches to match. */
 static double novelty_of(const struct feature_node *features, int label) {
   const double *means, *variances;
   int i, nr_feature;
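
For reference, below is a minimal standalone sketch of the calculation the new
comment describes. It is not the body of novelty_of() from FPEngine.cc; the
names novelty_sketch, DEFAULT_VARIANCE, class_means, and class_variances, and
the value 0.01, are illustrative assumptions. It shows the per-feature-variance
(diagonal covariance) approximation of the Mahalanobis distance, with undefined
or zero variances replaced by a small default.

#include <math.h>
#include <stddef.h>

/* Small substitute used when a feature's variance is undefined or zero
   (the value 0.01 is an assumed placeholder, not the one used by Nmap). */
static const double DEFAULT_VARIANCE = 0.01;

/* Approximate Mahalanobis distance between a feature vector and a class,
   using only per-feature variances (a diagonal covariance matrix). */
static double novelty_sketch(const double *features, const double *class_means,
                             const double *class_variances, size_t nr_feature) {
  double sum = 0.0;
  size_t i;

  for (i = 0; i < nr_feature; i++) {
    double variance = class_variances[i];
    double diff;

    /* train.py maps undefined and zero variances to zero; a small default
       keeps the feature in the sum and makes small differences count a lot. */
    if (variance == 0.0)
      variance = DEFAULT_VARIANCE;
    diff = features[i] - class_means[i];
    /* Squared per-feature difference, scaled by the inverse of the variance. */
    sum += diff * diff / variance;
  }

  return sqrt(sum);
}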