1 /**This module contains a small but growing library for performing kernel
2  * density estimation.
3  *
4  * Author:  David Simcha
5  */
6 /*
7  * License:
8  * Boost Software License - Version 1.0 - August 17th, 2003
9  *
10  * Permission is hereby granted, free of charge, to any person or organization
11  * obtaining a copy of the software and accompanying documentation covered by
12  * this license (the "Software") to use, reproduce, display, distribute,
13  * execute, and transmit the Software, and to prepare derivative works of the
14  * Software, and to permit third-parties to whom the Software is furnished to
15  * do so, all subject to the following:
16  *
17  * The copyright notices in the Software and this entire statement, including
18  * the above license grant, this restriction and the following disclaimer,
19  * must be included in all copies of the Software, in whole or in part, and
20  * all derivative works of the Software, unless such copies or derivative
21  * works are solely in the form of machine-executable object code generated by
22  * a source language processor.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
27  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
28  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
29  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
30  * DEALINGS IN THE SOFTWARE.
31  */
32 module dstats.kerneldensity;
33 
34 import std.conv, std.math, std.exception, std.traits, std.range,
35     std.array, std.typetuple, dstats.distrib;
36 
37 import std.algorithm : min, max;
38 
39 import  dstats.alloc, dstats.base, dstats.summary;
40 
41 version(unittest) {
42     import dstats.random, std.stdio;
43 }
44 
45 /**Estimates densities in the 1-dimensional case.  The 1-D case is special
46  * enough to be treated as a special case, since it's very common and enables
47  * some significant optimizations that are otherwise not feasible.
48  *
49  * Under the hood, this works by binning the data into a large number of bins
50  * (currently 1,000), convolving it with the kernel function to smooth it, and
51  * then using linear interpolation to evaluate the density estimates.  This
52  * will produce results that are different from the textbook definition of
53  * kernel density estimation, but to an extent that's negligible in most cases.
54  * It also means that constructing this object is relatively expensive, but
 * evaluating a density estimate can be done in O(1) time complexity afterwards.
56  */
class KernelDensity1D {
private:
    // Smoothed density estimate, one value per bin, spanning
    // [minElem, maxElem] (bounds already widened by the edge buffer).
    immutable double[] bins;

    // Running sums of bins, normalized so the last element is exactly 1.
    // Used by cdf() for interpolated cumulative probabilities.
    immutable double[] cumulative;

    // Lower and upper bounds of the binned region.  Queries outside
    // [minElem, maxElem] get density 0 (see opCall).
    immutable double minElem;
    immutable double maxElem;

    // Precomputed bins.length / (maxElem - minElem): multiplying
    // (x - minElem) by this maps x onto fractional bin coordinates.
    immutable double diffNeg1Nbin;


    this(immutable double[] bins, immutable double[] cumulative,
         double minElem, double maxElem) {
        this.bins = bins;
        this.cumulative = cumulative;
        this.minElem = minElem;
        this.maxElem = maxElem;
        this.diffNeg1Nbin = bins.length / (maxElem - minElem);
    }

    private static double findEdgeBuffer(C)(C kernel) {
        // Search for the approx. point where the kernel's density is 0.001 *
        // what it is at zero.  Strategy: grow ret geometrically until the
        // relative density drops below 1e-3, then shrink it with successively
        // gentler factors until it lands in the window [1e-4, 1e-3].
        // Assumes the kernel is symmetric, unimodal, and maximal at zero
        // (documented preconditions of fromCallable).
        immutable zeroVal = kernel(0);
        double ret = 1;

        double factor = 4;
        double kernelVal;

        do {
            while(kernel(ret) / zeroVal > 1e-3) {
                ret *= factor;
            }

            // Move factor halfway toward 1 so each subsequent shrink phase
            // overshoots less; the loop guard (factor > 1) guarantees
            // termination once factor rounds down to 1.
            factor = (factor - 1) / 2 + 1;
            while(kernel(ret) / zeroVal < 1e-4) {
                ret /= factor;
            }

            kernelVal = kernel(ret) / zeroVal;
        } while((kernelVal > 1e-3 || kernelVal < 1e-4) && factor > 1);

        return ret;
    }

public:
    /**Construct a kernel density estimation object from a callable object.
     * R must be a range of numeric types.  C must be a kernel function,
     * delegate, or class or struct with overloaded opCall.  The kernel
     * function is assumed to be symmetric about zero, to take its maximum
     * value at zero and to be unimodal.
     *
     * edgeBuffer determines how much space below and above the smallest and
     * largest observed value will be allocated when doing the binning.
     * Values less than reduce!min(range) - edgeBuffer or greater than
     * reduce!max(range) + edgeBuffer will be assigned a density of zero.
     * If this value is left at its default, it will be set to a value at which
     * the kernel is somewhere between 1e-3 and 1e-4 times its value at zero.
     *
     * The bandwidth of the kernel is indirectly selected by parametrizing the
     * kernel function.
     *
     * Examples:
     * ---
     * auto randNums = randArray!rNormal(1_000, 0, 1);
     * auto kernel = parametrize!normalPDF(0, 0.01);
     * auto density = KernelDensity1D.fromCallable(kernel, randNums);
     * writeln(normalPDF(1, 0, 1), "  ", density(1));  // Should be about the same.
     * ---
     */
    static KernelDensity1D fromCallable(C, R)
    (scope C kernel, R range, double edgeBuffer = double.nan)
    if(isForwardRange!R && is(typeof(kernel(2.0)) : double)) {
        enum nBin = 1000;
        auto alloc = newRegionAllocator();

        // First pass over the data: count elements and find the data range.
        // NOTE(review): the range is traversed twice via foreach without
        // calling .save; this relies on foreach copying value-semantic
        // ranges — confirm behavior for reference-type forward ranges.
        uint N = 0;
        double minElem = double.infinity;
        double maxElem = -double.infinity;
        foreach(elem; range) {
            minElem = min(minElem, elem);
            maxElem = max(maxElem, elem);
            N++;
        }

        if(isNaN(edgeBuffer)) {
            edgeBuffer = findEdgeBuffer(kernel);
        }
        minElem -= edgeBuffer;
        maxElem += edgeBuffer;

        // Using ints here because they convert faster to floats than uints do.
        auto binsRaw = alloc.uninitializedArray!(int[])(nBin);
        binsRaw[] = 0;

        // Second pass: histogram the data into nBin equal-width bins over
        // [minElem, maxElem].
        foreach(elemRaw; range) {
            double elem = elemRaw - minElem;
            elem /= (maxElem - minElem);
            elem *= nBin;
            auto bin = to!uint(elem);
            // An element exactly at maxElem maps to bin == nBin; clamp it
            // into the last valid bin.
            if(bin == nBin) {
                bin--;
            }

            binsRaw[bin]++;
        }

        // Convolve the binned data with our kernel.  Since N is fairly small
        // we'll use a simple O(N^2) algorithm.  According to my measurements,
        // this is actually comparable in speed to using an FFT (and a lot
        // simpler and more space efficient) because:
        //
        // 1.  We can take advantage of kernel symmetry.
        //
        // 2.  We can take advantage of the sparsity of binsRaw.  (We don't
        //     need to convolve the zero count bins.)
        //
        // 3.  We don't need to do any zero padding to get a non-cyclic
        //     convolution.
        //
        // 4.  We don't need to convolve the tails of the kernel function,
        //     where the contribution to the final density estimate would be
        //     negligible.
        auto binsCooked = uninitializedArray!(double[])(nBin);
        binsCooked[] = 0;

        // Tabulate the kernel at bin-width increments; only the right half
        // is needed because the kernel is assumed symmetric about zero.
        auto kernelPoints = alloc.uninitializedArray!(double[])(nBin);
        immutable stepSize = (maxElem - minElem) / nBin;

        kernelPoints[0] = kernel(0);
        immutable stopAt = kernelPoints[0] * 1e-10;
        foreach(ptrdiff_t i; 1..kernelPoints.length) {
            kernelPoints[i] = kernel(stepSize * i);

            // Don't bother convolving stuff that contributes negligibly.
            if(kernelPoints[i] < stopAt) {
                kernelPoints = kernelPoints[0..i];
                break;
            }
        }

        // Spread each non-empty bin's count symmetrically left and right,
        // weighted by the tabulated kernel values.
        foreach(i, count; binsRaw) if(count > 0) {
            binsCooked[i] += kernelPoints[0] * count;

            foreach(offset; 1..min(kernelPoints.length, max(i + 1, nBin - i))) {
                immutable kernelVal = kernelPoints[offset];

                if(i >= offset) {
                    binsCooked[i - offset] += kernelVal * count;
                }

                if(i + offset < nBin) {
                    binsCooked[i + offset] += kernelVal * count;
                }
            }
        }

        // Normalize: total mass 1, then scale so values integrate to 1 over
        // the bin width.
        binsCooked[] /= sum(binsCooked);
        binsCooked[] *= nBin / (maxElem - minElem);  // Make it a density.

        // Prefix-sum the density to get the (renormalized) CDF table.
        auto cumulative = uninitializedArray!(double[])(nBin);
        cumulative[0] = binsCooked[0];
        foreach(i; 1..nBin) {
            cumulative[i] = cumulative[i - 1] + binsCooked[i];
        }
        cumulative[] /= cumulative[$ - 1];

        return new typeof(this)(
            assumeUnique(binsCooked), assumeUnique(cumulative),
            minElem, maxElem);
    }

    /**Construct a kernel density estimator from an alias.*/
    static KernelDensity1D fromAlias(alias kernel, R)
    (R range, double edgeBuffer = double.nan)
    if(isForwardRange!R && is(typeof(kernel(2.0)) : double)) {
        // Wrap the alias in a concrete function so fromCallable can take its
        // address.
        static double kernelFun(double x) {
            return kernel(x);
        }

        return fromCallable(&kernelFun, range, edgeBuffer);
    }

    /**Construct a kernel density estimator using the default kernel, which is
     * a Gaussian kernel with the Scott bandwidth.
     */
    static KernelDensity1D fromDefaultKernel(R)
    (R range, double edgeBuffer = double.nan)
    if(isForwardRange!R && is(ElementType!R : double)) {
        immutable bandwidth = scottBandwidth(range.save);

        double kernel(double x) {
            return normalPDF(x, 0, bandwidth);
        }

        return fromCallable(&kernel, range, edgeBuffer);
    }

    /**Compute the probability density at a given point.*/
    double opCall(double x) const {
        // Outside the binned region the estimate is defined to be zero.
        if(x < minElem || x > maxElem) {
            return 0;
        }

        // Map x onto fractional bin coordinates.
        x -= minElem;
        x *= diffNeg1Nbin;

        // Linearly interpolate between the two neighboring bins.
        immutable fract = x - floor(x);
        immutable upper = to!size_t(ceil(x));
        immutable lower = to!size_t(floor(x));

        // x == maxElem maps exactly to bins.length; return the last bin.
        if(upper == bins.length) {
            return bins[$ - 1];
        }

        immutable ret = fract * bins[upper] + (1 - fract) * bins[lower];
        return max(0, ret);  // Compensate for roundoff
    }

    /**Compute the cumulative density, i.e. the integral from -infinity to x.*/
    double cdf(double x) const {
        if(x <= minElem) {
            return 0;
        } else if(x >= maxElem) {
            return 1;
        }

        // Same fractional-bin mapping and linear interpolation as opCall,
        // but over the normalized cumulative table.
        x -= minElem;
        x *= diffNeg1Nbin;

        immutable fract = x - floor(x);
        immutable upper = to!size_t(ceil(x));
        immutable lower = to!size_t(floor(x));

        if(upper == cumulative.length) {
            return 1;
        }

        return fract * cumulative[upper] + (1 - fract) * cumulative[lower];
    }

    /**Compute the cumulative density from the rhs, i.e. the integral from
     * x to infinity.
     */
    double cdfr(double x) const {
        // Here, we can get away with just returning 1 - cdf b/c
        // there are inaccuracies several orders of magnitude bigger than
        // the rounding error.
        return 1.0 - cdf(x);
    }
}
306 
unittest {
    // A single observation at zero smoothed with a standard-normal kernel
    // should reproduce the standard normal density and CDF very closely.
    auto gaussKernel = parametrize!normalPDF(0, 1);
    auto density = KernelDensity1D.fromCallable(gaussKernel, [0]);
    assert(approxEqual(density(1), normalPDF(1)));
    assert(approxEqual(density.cdf(1), normalCDF(1)));
    assert(approxEqual(density.cdfr(1), normalCDFR(1)));

    // Instantiation smoke tests for the other factory methods.
    auto cosineEstimate = KernelDensity1D.fromAlias!cos([0], 1);
    auto defaultEstimate = KernelDensity1D.fromDefaultKernel([1, 2, 3]);
}
319 
320 /**Uses Scott's Rule to select the bandwidth of the Gaussian kernel density
321  * estimator.  This is 1.06 * min(stdev(data), interquartileRange(data) / 1.34)
322  * N ^^ -0.2.  R must be a forward range of numeric types.
323  *
324  * Examples:
325  * ---
326  * immutable bandwidth = scottBandwidth(data);
327  * auto kernel = parametrize!normalPDF(0, bandwidth);
 * auto kde = KernelDensity1D.fromCallable(kernel, data);
329  * ---
330  *
331  * References:
332  * Scott, D. W. (1992) Multivariate Density Estimation: Theory, Practice,
333  * and Visualization. Wiley.
334  */
double scottBandwidth(R)(R data)
if(isForwardRange!R && is(ElementType!R : double)) {
    // Robust scale estimate: take the smaller of the sample standard
    // deviation and the normalized interquartile range (IQR / 1.34
    // approximates sigma for Gaussian data), so heavy tails or outliers
    // don't inflate the bandwidth.
    immutable stats = meanStdev(data.save);
    immutable iqrSigma = interquantileRange(data.save, 0.25) / 1.34;
    immutable scale = min(stats.stdev, iqrSigma);

    // Scott's rule: h = 1.06 * sigmaHat * N ^^ (-1/5).
    return 1.06 * scale * (stats.N ^^ -0.2);
}
344 
unittest {
    // Expected bandwidths computed with R.
    immutable bw1 = scottBandwidth([1,2,3,4,5]);
    assert(approxEqual(bw1, 1.14666));

    immutable bw2 = scottBandwidth([1,2,2,2,2,8,8,8,8]);
    assert(approxEqual(bw2, 2.242446));
}
350 
351 /**Construct an N-dimensional kernel density estimator.  This is done using
 * the textbook definition of kernel density estimation, since the binning
 * and convolving method used in the 1-D case would rapidly become
 * infeasible w.r.t. memory usage as dimensionality increased.
355  *
356  * Eventually, a 2-D estimator might be added as another special case, but
357  * beyond 2-D, bin-and-convolute clearly isn't feasible.
358  *
359  * This class can be used for 1-D estimation instead of KernelDensity1D, and
360  * will work properly.  This is useful if:
361  *
362  * 1.  You can't accept even the slightest deviation from the results that the
363  *     textbook definition of kernel density estimation would produce.
364  *
365  * 2.  You are only going to evaluate at a few points and want to avoid the
366  *     up-front cost of the convolution used in the 1-D case.
367  *
368  * 3.  You're using some weird kernel that doesn't meet the assumptions
369  *     required for KernelDensity1D.
370  */
class KernelDensity {
    // One array per dimension; points[j][i] is the j-th coordinate of the
    // i-th data point.  All inner arrays have equal length (enforced in
    // fromCallable).
    private immutable double[][] points;

    // Kernel normalized to the array-variadic form by wrapToArrayVariadic.
    private double delegate(double[]...) kernel;

    private this(immutable double[][] points) {
        this.points = points;
    }

    /**Returns the number of dimensions in the estimator.*/
    uint nDimensions() const @property {
        // More than uint.max dimensions is absolutely implausible.
        assert(points.length <= uint.max);
        return cast(uint) points.length;
    }

    /**Construct a kernel density estimator from a kernel provided as a callable
     * object (such as a function pointer, delegate, or class with overloaded
     * opCall).  R must be either a range of ranges, multiple ranges passed in
     * as variadic arguments, or a single range for the 1D case.  Each range
     * represents the values of one variable in the joint distribution.
     * kernel must accept either an array of doubles or the same number of
     * arguments as the number of dimensions, and must return a floating point
     * number.
     *
     * Examples:
     * ---
     * // Create an estimate of the density of the joint distribution of
     * // hours sleep and programming skill.
     * auto programmingSkill = [8,6,7,5,3,0,9];
     * auto hoursSleep = [3,6,2,4,3,5,8];
     *
     * // Make a 2D Gaussian kernel function with bandwidth 0.5 in both
     * // dimensions and covariance zero.
     * static double myKernel(double x1, double x2) {
     *    return normalPDF(x1, 0, 0.5) * normalPDF(x2, 0, 0.5);
     * }
     *
     * auto estimator = KernelDensity.fromCallable
     *     (&myKernel, programmingSkill, hoursSleep);
     *
     * // Estimate the density at programming skill 1, 2 hours sleep.
     * auto density = estimator(1, 2);
     * ---
     */
    static KernelDensity fromCallable(C, R...)(C kernel, R ranges)
    if(allSatisfy!(isInputRange, R)) {
        auto kernelWrapped = wrapToArrayVariadic(kernel);

        // A single argument whose elements are themselves ranges is treated
        // as a range of per-dimension ranges; otherwise each variadic
        // argument is one dimension.
        static if(R.length == 1 && isInputRange!(typeof(ranges[0].front))) {
            alias ranges[0] data;
        } else {
            alias ranges data;
        }

        // Copy every input range into a plain double[], sizing up front
        // when the range knows its length.
        double[][] points;
        foreach(range; data) {
            double[] asDoubles;

            static if(hasLength!(typeof(range))) {
                asDoubles = uninitializedArray!(double[])(range.length);

                size_t i = 0;
                foreach(elem; range) {
                    asDoubles[i++] = elem;
                }
            } else {
                auto app = appender(&asDoubles);
                foreach(elem; range) {
                    app.put(elem);
                }
            }

            points ~= asDoubles;
        }

        dstatsEnforce(points.length,
            "Can't construct a zero dimensional kernel density estimator.");

        // Every dimension must supply the same number of observations.
        foreach(arr; points[1..$]) {
            dstatsEnforce(arr.length == points[0].length,
                "All ranges must be the same length to construct a KernelDensity.");
        }

        auto ret = new KernelDensity(assumeUnique(points));
        ret.kernel = kernelWrapped;

        return ret;
    }

    /**Estimate the density at the point given by x.  The variables in X are
     * provided in the same order as the ranges were provided for estimation.
     */
    double opCall(double[] x...) const {
        dstatsEnforce(x.length == points.length,
            "Dimension mismatch when evaluating kernel density.");
        double sum = 0;

        // Textbook KDE: average the kernel evaluated at (x - data point)
        // over all data points.
        auto alloc = newRegionAllocator();
        auto dataPoint = alloc.uninitializedArray!(double[])(points.length);
        foreach(i; 0..points[0].length) {
            foreach(j; 0..points.length) {
                dataPoint[j] = x[j] - points[j][i];
            }

            sum += kernel(dataPoint);
        }

        // NOTE(review): if the input ranges were empty, points[0].length is
        // zero and this yields NaN (0/0) — confirm whether empty input
        // should instead be rejected in fromCallable.
        sum /= points[0].length;
        return sum;
    }
}
482 
unittest {
    // The generic N-D estimator and the optimized 1-D estimator should
    // agree closely on the same Gaussian kernel and data.
    auto samples = randArray!rNormal(100, 0, 1);
    auto gaussKernel = parametrize!normalPDF(0, scottBandwidth(samples));
    auto generic = KernelDensity.fromCallable(gaussKernel, samples);
    auto fast = KernelDensity1D.fromCallable(gaussKernel, samples);
    foreach(x; 0..5) {
        assert(abs(generic(x) - fast(x)) < 0.01);
    }

    // Verify that the documentation example compiles and runs.
    auto programmingSkill = [8,6,7,5,3,0,9];
    auto hoursSleep = [3,6,2,4,3,5,8];

    // 2D Gaussian kernel: bandwidth 0.5 in each dimension, zero covariance.
    static double myKernel(double x1, double x2) {
        return normalPDF(x1, 0, 0.5) * normalPDF(x2, 0, 0.5);
    }

    auto estimator = KernelDensity.fromCallable
        (&myKernel, programmingSkill, hoursSleep);

    // Density at programming skill 1, 2 hours sleep.
    auto density = estimator(1, 2);

    // Instantiate from a functor (estimator itself overloads opCall).
    auto fromFunctor = KernelDensity.fromCallable(estimator, hoursSleep);
}
511 
512 
513 private:
514 
// Normalizes any supported kernel callable (delegate, function pointer, or
// functor with opCall) to a single uniform shape:  a delegate taking an
// array of doubles variadically and returning double.  Kernels declared
// with individual double parameters are adapted by forwarding array
// elements as separate arguments via a CTFE-generated call list.
double delegate(double[]...) wrapToArrayVariadic(C)(C callable) {
    static if(is(C == delegate) || isFunctionPointer!C) {
        alias callable fun;
    } else {  // It's a functor.
        alias callable.opCall fun;
    }

    alias ParameterTypeTuple!fun params;
    static if(params.length == 1 && is(params[0] == double[])) {
        // Already in the right form.
        static if(is(C == delegate) && is(ReturnType!C == double)) {
            return callable;
        } else static if(is(ReturnType!(callable.opCall) == double)) {
            // NOTE(review): this branch references callable.opCall, which
            // does not exist for a plain function pointer taking double[] —
            // confirm that path compiles (it may need an isFunctionPointer
            // case before this one).
            return &callable.opCall;
        } else {  // Need to forward.
            double forward(double[] args...) {
                return fun(args);
            }

            return &forward;
        }
    } else {  // Need to convert to single arguments and forward.
        // Kernels with N scalar parameters: all must be floating point.
        static assert(allSatisfy!(isFloatingPoint, params));

        // Unpack the runtime array into N separate arguments using a
        // compile-time generated argument list (see makeCallList).
        double doCall(double[] args...) {
            assert(args.length == params.length);
            mixin("return fun(" ~ makeCallList(params.length) ~ ");");
        }

        return &doCall;
    }
}
547 
548 // CTFE function for forwarding elements of an array as single function
549 // arguments.
// Builds the comma-separated argument list "args[0], args[1], ..., args[N-1]"
// for mixing into a forwarding call (see wrapToArrayVariadic).  Evaluated at
// compile time via CTFE.
//
// Returns the empty string for N == 0.  The previous implementation computed
// the loop bound as N - 1 in unsigned arithmetic, which underflowed to
// uint.max for N == 0 and then emitted an out-of-range "args[N - 1]" term.
string makeCallList(uint N) {
    string ret;
    foreach(i; 0..N) {
        if(i > 0) {
            ret ~= ", ";
        }
        ret ~= "args[" ~ to!string(i) ~ "]";
    }

    return ret;
}