1 /**This module contains a small but growing library for performing kernel 2 * density estimation. 3 * 4 * Author: David Simcha 5 */ 6 /* 7 * License: 8 * Boost Software License - Version 1.0 - August 17th, 2003 9 * 10 * Permission is hereby granted, free of charge, to any person or organization 11 * obtaining a copy of the software and accompanying documentation covered by 12 * this license (the "Software") to use, reproduce, display, distribute, 13 * execute, and transmit the Software, and to prepare derivative works of the 14 * Software, and to permit third-parties to whom the Software is furnished to 15 * do so, all subject to the following: 16 * 17 * The copyright notices in the Software and this entire statement, including 18 * the above license grant, this restriction and the following disclaimer, 19 * must be included in all copies of the Software, in whole or in part, and 20 * all derivative works of the Software, unless such copies or derivative 21 * works are solely in the form of machine-executable object code generated by 22 * a source language processor. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 27 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 * DEALINGS IN THE SOFTWARE. 31 */ 32 module dstats.kerneldensity; 33 34 import std.conv, std.math, std.exception, std.traits, std.range, 35 std.array, std.typetuple, dstats.distrib; 36 37 import std.algorithm : min, max; 38 39 import dstats.alloc, dstats.base, dstats.summary; 40 41 version(unittest) { 42 import dstats.random, std.stdio; 43 } 44 45 /**Estimates densities in the 1-dimensional case. The 1-D case is special 46 * enough to be treated as a special case, since it's very common and enables 47 * some significant optimizations that are otherwise not feasible. 48 * 49 * Under the hood, this works by binning the data into a large number of bins 50 * (currently 1,000), convolving it with the kernel function to smooth it, and 51 * then using linear interpolation to evaluate the density estimates. This 52 * will produce results that are different from the textbook definition of 53 * kernel density estimation, but to an extent that's negligible in most cases. 54 * It also means that constructing this object is relatively expensive, but 55 * evaluating a density estimate can be done in O(1) time complexity afterwords. 56 */ 57 class KernelDensity1D { 58 private: 59 immutable double[] bins; 60 immutable double[] cumulative; 61 immutable double minElem; 62 immutable double maxElem; 63 immutable double diffNeg1Nbin; 64 65 66 this(immutable double[] bins, immutable double[] cumulative, 67 double minElem, double maxElem) { 68 this.bins = bins; 69 this.cumulative = cumulative; 70 this.minElem = minElem; 71 this.maxElem = maxElem; 72 this.diffNeg1Nbin = bins.length / (maxElem - minElem); 73 } 74 75 private static double findEdgeBuffer(C)(C kernel) { 76 // Search for the approx. point where the kernel's density is 0.001 * 77 // what it is at zero. 78 immutable zeroVal = kernel(0); 79 double ret = 1; 80 81 double factor = 4; 82 double kernelVal; 83 84 do { 85 while(kernel(ret) / zeroVal > 1e-3) { 86 ret *= factor; 87 } 88 89 factor = (factor - 1) / 2 + 1; 90 while(kernel(ret) / zeroVal < 1e-4) { 91 ret /= factor; 92 } 93 94 kernelVal = kernel(ret) / zeroVal; 95 } while((kernelVal > 1e-3 || kernelVal < 1e-4) && factor > 1); 96 97 return ret; 98 } 99 100 public: 101 /**Construct a kernel density estimation object from a callable object. 102 * R must be a range of numeric types. C must be a kernel function, 103 * delegate, or class or struct with overloaded opCall. The kernel 104 * function is assumed to be symmetric about zero, to take its maximum 105 * value at zero and to be unimodal. 106 * 107 * edgeBuffer determines how much space below and above the smallest and 108 * largest observed value will be allocated when doing the binning. 109 * Values less than reduce!min(range) - edgeBuffer or greater than 110 * reduce!max(range) + edgeBuffer will be assigned a density of zero. 111 * If this value is left at its default, it will be set to a value at which 112 * the kernel is somewhere between 1e-3 and 1e-4 times its value at zero. 113 * 114 * The bandwidth of the kernel is indirectly selected by parametrizing the 115 * kernel function. 116 * 117 * Examples: 118 * --- 119 * auto randNums = randArray!rNormal(1_000, 0, 1); 120 * auto kernel = parametrize!normalPDF(0, 0.01); 121 * auto density = KernelDensity1D(kernel, randNums); 122 * writeln(normalPDF(1, 0, 1), " ", density(1)). // Should be about the same. 123 * --- 124 */ 125 static KernelDensity1D fromCallable(C, R) 126 (scope C kernel, R range, double edgeBuffer = double.nan) 127 if(isForwardRange!R && is(typeof(kernel(2.0)) : double)) { 128 enum nBin = 1000; 129 auto alloc = newRegionAllocator(); 130 131 uint N = 0; 132 double minElem = double.infinity; 133 double maxElem = -double.infinity; 134 foreach(elem; range) { 135 minElem = min(minElem, elem); 136 maxElem = max(maxElem, elem); 137 N++; 138 } 139 140 if(isNaN(edgeBuffer)) { 141 edgeBuffer = findEdgeBuffer(kernel); 142 } 143 minElem -= edgeBuffer; 144 maxElem += edgeBuffer; 145 146 // Using ints here because they convert faster to floats than uints do. 147 auto binsRaw = alloc.uninitializedArray!(int[])(nBin); 148 binsRaw[] = 0; 149 150 foreach(elemRaw; range) { 151 double elem = elemRaw - minElem; 152 elem /= (maxElem - minElem); 153 elem *= nBin; 154 auto bin = to!uint(elem); 155 if(bin == nBin) { 156 bin--; 157 } 158 159 binsRaw[bin]++; 160 } 161 162 // Convolve the binned data with our kernel. Since N is fairly small 163 // we'll use a simple O(N^2) algorithm. According to my measurements, 164 // this is actually comparable in speed to using an FFT (and a lot 165 // simplier and more space efficient) because: 166 // 167 // 1. We can take advantage of kernel symmetry. 168 // 169 // 2. We can take advantage of the sparsity of binsRaw. (We don't 170 // need to convolve the zero count bins.) 171 // 172 // 3. We don't need to do any zero padding to get a non-cyclic 173 // convolution. 174 // 175 // 4. We don't need to convolve the tails of the kernel function, 176 // where the contribution to the final density estimate would be 177 // negligible. 178 auto binsCooked = uninitializedArray!(double[])(nBin); 179 binsCooked[] = 0; 180 181 auto kernelPoints = alloc.uninitializedArray!(double[])(nBin); 182 immutable stepSize = (maxElem - minElem) / nBin; 183 184 kernelPoints[0] = kernel(0); 185 immutable stopAt = kernelPoints[0] * 1e-10; 186 foreach(ptrdiff_t i; 1..kernelPoints.length) { 187 kernelPoints[i] = kernel(stepSize * i); 188 189 // Don't bother convolving stuff that contributes negligibly. 190 if(kernelPoints[i] < stopAt) { 191 kernelPoints = kernelPoints[0..i]; 192 break; 193 } 194 } 195 196 foreach(i, count; binsRaw) if(count > 0) { 197 binsCooked[i] += kernelPoints[0] * count; 198 199 foreach(offset; 1..min(kernelPoints.length, max(i + 1, nBin - i))) { 200 immutable kernelVal = kernelPoints[offset]; 201 202 if(i >= offset) { 203 binsCooked[i - offset] += kernelVal * count; 204 } 205 206 if(i + offset < nBin) { 207 binsCooked[i + offset] += kernelVal * count; 208 } 209 } 210 } 211 212 binsCooked[] /= sum(binsCooked); 213 binsCooked[] *= nBin / (maxElem - minElem); // Make it a density. 214 215 auto cumulative = uninitializedArray!(double[])(nBin); 216 cumulative[0] = binsCooked[0]; 217 foreach(i; 1..nBin) { 218 cumulative[i] = cumulative[i - 1] + binsCooked[i]; 219 } 220 cumulative[] /= cumulative[$ - 1]; 221 222 return new typeof(this)( 223 assumeUnique(binsCooked), assumeUnique(cumulative), 224 minElem, maxElem); 225 } 226 227 /**Construct a kernel density estimator from an alias.*/ 228 static KernelDensity1D fromAlias(alias kernel, R) 229 (R range, double edgeBuffer = double.nan) 230 if(isForwardRange!R && is(typeof(kernel(2.0)) : double)) { 231 static double kernelFun(double x) { 232 return kernel(x); 233 } 234 235 return fromCallable(&kernelFun, range, edgeBuffer); 236 } 237 238 /**Construct a kernel density estimator using the default kernel, which is 239 * a Gaussian kernel with the Scott bandwidth. 240 */ 241 static KernelDensity1D fromDefaultKernel(R) 242 (R range, double edgeBuffer = double.nan) 243 if(isForwardRange!R && is(ElementType!R : double)) { 244 immutable bandwidth = scottBandwidth(range.save); 245 246 double kernel(double x) { 247 return normalPDF(x, 0, bandwidth); 248 } 249 250 return fromCallable(&kernel, range, edgeBuffer); 251 } 252 253 /**Compute the probability density at a given point.*/ 254 double opCall(double x) const { 255 if(x < minElem || x > maxElem) { 256 return 0; 257 } 258 259 x -= minElem; 260 x *= diffNeg1Nbin; 261 262 immutable fract = x - floor(x); 263 immutable upper = to!size_t(ceil(x)); 264 immutable lower = to!size_t(floor(x)); 265 266 if(upper == bins.length) { 267 return bins[$ - 1]; 268 } 269 270 immutable ret = fract * bins[upper] + (1 - fract) * bins[lower]; 271 return max(0, ret); // Compensate for roundoff 272 } 273 274 /**Compute the cumulative density, i.e. the integral from -infinity to x.*/ 275 double cdf(double x) const { 276 if(x <= minElem) { 277 return 0; 278 } else if(x >= maxElem) { 279 return 1; 280 } 281 282 x -= minElem; 283 x *= diffNeg1Nbin; 284 285 immutable fract = x - floor(x); 286 immutable upper = to!size_t(ceil(x)); 287 immutable lower = to!size_t(floor(x)); 288 289 if(upper == cumulative.length) { 290 return 1; 291 } 292 293 return fract * cumulative[upper] + (1 - fract) * cumulative[lower]; 294 } 295 296 /**Compute the cumulative density from the rhs, i.e. the integral from 297 * x to infinity. 298 */ 299 double cdfr(double x) const { 300 // Here, we can get away with just returning 1 - cdf b/c 301 // there are inaccuracies several orders of magnitude bigger than 302 // the rounding error. 303 return 1.0 - cdf(x); 304 } 305 } 306 307 unittest { 308 auto kde = KernelDensity1D.fromCallable(parametrize!normalPDF(0, 1), [0]); 309 assert(approxEqual(kde(1), normalPDF(1))); 310 assert(approxEqual(kde.cdf(1), normalCDF(1))); 311 assert(approxEqual(kde.cdfr(1), normalCDFR(1))); 312 313 // This is purely to see if fromAlias works. 314 auto cosKde = KernelDensity1D.fromAlias!cos([0], 1); 315 316 // Make sure fromDefaultKernel at least instantiates. 317 auto defaultKde = KernelDensity1D.fromDefaultKernel([1, 2, 3]); 318 } 319 320 /**Uses Scott's Rule to select the bandwidth of the Gaussian kernel density 321 * estimator. This is 1.06 * min(stdev(data), interquartileRange(data) / 1.34) 322 * N ^^ -0.2. R must be a forward range of numeric types. 323 * 324 * Examples: 325 * --- 326 * immutable bandwidth = scottBandwidth(data); 327 * auto kernel = parametrize!normalPDF(0, bandwidth); 328 * auto kde = KernelDensity1D(data, kernel); 329 * --- 330 * 331 * References: 332 * Scott, D. W. (1992) Multivariate Density Estimation: Theory, Practice, 333 * and Visualization. Wiley. 334 */ 335 double scottBandwidth(R)(R data) 336 if(isForwardRange!R && is(ElementType!R : double)) { 337 338 immutable summary = meanStdev(data.save); 339 immutable interquartile = interquantileRange(data.save, 0.25) / 1.34; 340 immutable sigmaHat = min(summary.stdev, interquartile); 341 342 return 1.06 * sigmaHat * (summary.N ^^ -0.2); 343 } 344 345 unittest { 346 // Values from R. 347 assert(approxEqual(scottBandwidth([1,2,3,4,5]), 1.14666)); 348 assert(approxEqual(scottBandwidth([1,2,2,2,2,8,8,8,8]), 2.242446)); 349 } 350 351 /**Construct an N-dimensional kernel density estimator. This is done using 352 * the textbook definition of kernel density estimation, since the binning 353 * and convolving method used in the 1-D case would rapidly become 354 * unfeasible w.r.t. memory usage as dimensionality increased. 355 * 356 * Eventually, a 2-D estimator might be added as another special case, but 357 * beyond 2-D, bin-and-convolute clearly isn't feasible. 358 * 359 * This class can be used for 1-D estimation instead of KernelDensity1D, and 360 * will work properly. This is useful if: 361 * 362 * 1. You can't accept even the slightest deviation from the results that the 363 * textbook definition of kernel density estimation would produce. 364 * 365 * 2. You are only going to evaluate at a few points and want to avoid the 366 * up-front cost of the convolution used in the 1-D case. 367 * 368 * 3. You're using some weird kernel that doesn't meet the assumptions 369 * required for KernelDensity1D. 370 */ 371 class KernelDensity { 372 private immutable double[][] points; 373 private double delegate(double[]...) kernel; 374 375 private this(immutable double[][] points) { 376 this.points = points; 377 } 378 379 /**Returns the number of dimensions in the estimator.*/ 380 uint nDimensions() const @property { 381 // More than uint.max dimensions is absolutely implausible. 382 assert(points.length <= uint.max); 383 return cast(uint) points.length; 384 } 385 386 /**Construct a kernel density estimator from a kernel provided as a callable 387 * object (such as a function pointer, delegate, or class with overloaded 388 * opCall). R must be either a range of ranges, multiple ranges passed in 389 * as variadic arguments, or a single range for the 1D case. Each range 390 * represents the values of one variable in the joint distribution. 391 * kernel must accept either an array of doubles or the same number of 392 * arguments as the number of dimensions, and must return a floating point 393 * number. 394 * 395 * Examples: 396 * --- 397 * // Create an estimate of the density of the joint distribution of 398 * // hours sleep and programming skill. 399 * auto programmingSkill = [8,6,7,5,3,0,9]; 400 * auto hoursSleep = [3,6,2,4,3,5,8]; 401 * 402 * // Make a 2D Gaussian kernel function with bandwidth 0.5 in both 403 * // dimensions and covariance zero. 404 * static double myKernel(double x1, double x2) { 405 * return normalPDF(x1, 0, 0.5) * normalPDF(x2, 0, 0.5); 406 * } 407 * 408 * auto estimator = KernelDensity.fromCallable 409 * (&myKernel, programmingSkill, hoursSleep); 410 * 411 * // Estimate the density at programming skill 1, 2 hours sleep. 412 * auto density = estimator(1, 2); 413 * --- 414 */ 415 static KernelDensity fromCallable(C, R...)(C kernel, R ranges) 416 if(allSatisfy!(isInputRange, R)) { 417 auto kernelWrapped = wrapToArrayVariadic(kernel); 418 419 static if(R.length == 1 && isInputRange!(typeof(ranges[0].front))) { 420 alias ranges[0] data; 421 } else { 422 alias ranges data; 423 } 424 425 double[][] points; 426 foreach(range; data) { 427 double[] asDoubles; 428 429 static if(hasLength!(typeof(range))) { 430 asDoubles = uninitializedArray!(double[])(range.length); 431 432 size_t i = 0; 433 foreach(elem; range) { 434 asDoubles[i++] = elem; 435 } 436 } else { 437 auto app = appender(&asDoubles); 438 foreach(elem; range) { 439 app.put(elem); 440 } 441 } 442 443 points ~= asDoubles; 444 } 445 446 dstatsEnforce(points.length, 447 "Can't construct a zero dimensional kernel density estimator."); 448 449 foreach(arr; points[1..$]) { 450 dstatsEnforce(arr.length == points[0].length, 451 "All ranges must be the same length to construct a KernelDensity."); 452 } 453 454 auto ret = new KernelDensity(assumeUnique(points)); 455 ret.kernel = kernelWrapped; 456 457 return ret; 458 } 459 460 /**Estimate the density at the point given by x. The variables in X are 461 * provided in the same order as the ranges were provided for estimation. 462 */ 463 double opCall(double[] x...) const { 464 dstatsEnforce(x.length == points.length, 465 "Dimension mismatch when evaluating kernel density."); 466 double sum = 0; 467 468 auto alloc = newRegionAllocator(); 469 auto dataPoint = alloc.uninitializedArray!(double[])(points.length); 470 foreach(i; 0..points[0].length) { 471 foreach(j; 0..points.length) { 472 dataPoint[j] = x[j] - points[j][i]; 473 } 474 475 sum += kernel(dataPoint); 476 } 477 478 sum /= points[0].length; 479 return sum; 480 } 481 } 482 483 unittest { 484 auto data = randArray!rNormal(100, 0, 1); 485 auto kernel = parametrize!normalPDF(0, scottBandwidth(data)); 486 auto kde = KernelDensity.fromCallable(kernel, data); 487 auto kde1 = KernelDensity1D.fromCallable(kernel, data); 488 foreach(i; 0..5) { 489 assert(abs(kde(i) - kde1(i)) < 0.01); 490 } 491 492 // Make sure example compiles. 493 auto programmingSkill = [8,6,7,5,3,0,9]; 494 auto hoursSleep = [3,6,2,4,3,5,8]; 495 496 // Make a 2D Gaussian kernel function with bandwidth 0.5 in both 497 // dimensions and covariance zero. 498 static double myKernel(double x1, double x2) { 499 return normalPDF(x1, 0, 0.5) * normalPDF(x2, 0, 0.5); 500 } 501 502 auto estimator = KernelDensity.fromCallable 503 (&myKernel, programmingSkill, hoursSleep); 504 505 // Estimate the density at programming skill 1, 2 hours sleep. 506 auto density = estimator(1, 2); 507 508 // Test instantiating from functor. 509 auto foo = KernelDensity.fromCallable(estimator, hoursSleep); 510 } 511 512 513 private: 514 515 double delegate(double[]...) wrapToArrayVariadic(C)(C callable) { 516 static if(is(C == delegate) || isFunctionPointer!C) { 517 alias callable fun; 518 } else { // It's a functor. 519 alias callable.opCall fun; 520 } 521 522 alias ParameterTypeTuple!fun params; 523 static if(params.length == 1 && is(params[0] == double[])) { 524 // Already in the right form. 525 static if(is(C == delegate) && is(ReturnType!C == double)) { 526 return callable; 527 } else static if(is(ReturnType!(callable.opCall) == double)) { 528 return &callable.opCall; 529 } else { // Need to forward. 530 double forward(double[] args...) { 531 return fun(args); 532 } 533 534 return &forward; 535 } 536 } else { // Need to convert to single arguments and forward. 537 static assert(allSatisfy!(isFloatingPoint, params)); 538 539 double doCall(double[] args...) { 540 assert(args.length == params.length); 541 mixin("return fun(" ~ makeCallList(params.length) ~ ");"); 542 } 543 544 return &doCall; 545 } 546 } 547 548 // CTFE function for forwarding elements of an array as single function 549 // arguments. 550 string makeCallList(uint N) { 551 string ret; 552 foreach(i; 0..N - 1) { 553 ret ~= "args[" ~ to!string(i) ~ "], "; 554 } 555 556 ret ~= "args[" ~ to!string(N - 1) ~ "]"; 557 return ret; 558 }