/**A module for performing linear regression. This module has an unusual
 * interface, as it is range-based instead of matrix based. Values for
 * independent variables are provided as either a tuple or a range of ranges.
 * This means that one can use, for example, map, to fit high order models and
 * lazily evaluate certain values. (For details, see examples below.)
 *
 * Author: David Simcha*/
/*
 * License:
 * Boost Software License - Version 1.0 - August 17th, 2003
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 *
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
module dstats.regress;

import std.math, std.traits, std.array, std.string,
    std.exception, std.typetuple, std.typecons, std.numeric, std.parallelism;

import std.algorithm : map, copy, max, min, filter, reduce, count;

alias std.range.repeat repeat;

import dstats.alloc, std.range, std.conv, dstats.distrib, dstats.cor,
    dstats.base, dstats.summary, dstats.sort;

version(unittest) {
    import std.stdio, dstats.random, std.functional;
}

///
struct PowMap(ExpType, T)
if(isForwardRange!(T)) {
    Unqual!T range;
    Unqual!ExpType exponent;
    double cache;

    this(T range, ExpType exponent) {
        this.range = range;
        this.exponent = exponent;

        static if(isIntegral!ExpType) {
            cache = pow(cast(double) range.front, exponent);
        } else {
            cache = pow(cast(ExpType) range.front, exponent);
        }
    }

    @property double front() const pure nothrow {
        return cache;
    }

    void popFront() {
        range.popFront;
        if(!range.empty) {
            cache = pow(cast(double) range.front, exponent);
        }
    }

    @property typeof(this) save() {
        return this;
    }

    @property bool empty() {
        return range.empty;
    }
}

/**Maps a forward range to a power determined at runtime. ExpType is the type
 * of the exponent. Using an int is faster than using a double, but obviously
 * less flexible.*/
PowMap!(ExpType, T) powMap(ExpType, T)(T range, ExpType exponent) {
    alias PowMap!(ExpType, T) RT;
    return RT(range, exponent);
}
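// Illustrative usage sketch (not part of the original test suite): powMap
// lazily raises each element of a range to a fixed power, which is how
// polyFit builds its polynomial terms below.
unittest {
    auto squares = powMap([1.0, 2.0, 3.0], 2);
    assert(approxEqual(array(squares), [1.0, 4.0, 9.0]));
}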
// Very ad-hoc, does a bunch of matrix ops for linearRegress and
// linearRegressBeta. Specifically, computes xTx and xTy.
// Written specifically to be efficient in the context used here.
private void rangeMatrixMulTrans(U, T...)(
    ref double[] xTy,
    ref DoubleMatrix xTx,
    U y, ref T matIn,
    RegionAllocator alloc
) {
    static if(isArray!(T[0]) &&
        isInputRange!(typeof(matIn[0][0])) && matIn.length == 1) {
        alias matIn[0] mat;
    } else {
        alias matIn mat;
    }

    bool someEmpty() {
        if(y.empty) {
            return true;
        }
        foreach(range; mat) {
            if(range.empty) {
                return true;
            }
        }
        return false;
    }

    void popAll() {
        foreach(ti, range; mat) {
            mat[ti].popFront;
        }
        y.popFront;
    }

    xTy[] = 0;
    xTx = doubleMatrix(mat.length, mat.length, alloc);
    foreach(i; 0..mat.length) foreach(j; 0..mat.length) {
        xTx[i, j] = 0;
    }

    auto alloc2 = newRegionAllocator();
    auto deltas = alloc2.uninitializedArray!(double[])(mat.length);

    // Using an algorithm similar to the one for Pearson cor to improve
    // numerical stability. Calculate means and covariances, then
    // combine them: Sum of products = mean1 * N * mean2 + N * cov.
    double k = 0;
    double[] means = alloc2.uninitializedArray!(double[])(mat.length);
    means[] = 0;
    double yMean = 0;

    while(!someEmpty) {
        foreach(i, elem; mat) {
            deltas[i] = cast(double) elem.front;
        }

        immutable kMinus1 = k;
        immutable kNeg1 = 1 / ++k;
        deltas[] -= means[];
        means[] += deltas[] * kNeg1;

        immutable yDelta = cast(double) y.front - yMean;
        yMean += yDelta * kNeg1;

        foreach(i, delta1; deltas) {
            xTy[i] += kMinus1 * delta1 * kNeg1 * yDelta;
            xTx[i, i] += kMinus1 * delta1 * kNeg1 * delta1;

            foreach(j, delta2; deltas[0..i]) {
                xTx[i, j] += kMinus1 * delta1 * kNeg1 * delta2;
            }
        }
        popAll();
    }

    // k is n now that we're done looping over the data.
    alias k n;

    // xTx now consists of covariance * n. Divide by n and add mean1 * mean2
    // to get the sum of products, scaled by 1 / n.
    foreach(i; 0..xTx.rows) foreach(j; 0..i + 1) {
        xTx[i, j] /= n;
        xTx[i, j] += means[i] * means[j];
    }

    // Similarly for the xTy vector.
    foreach(i, ref elem; xTy) {
        xTy[i] /= n;
        elem += yMean * means[i];
    }
    symmetrize(xTx);
}

// Copies values from lower triangle to upper triangle.
private void symmetrize(ref DoubleMatrix mat) {
    foreach(i; 1..mat.rows) {
        foreach(j; 0..i) {
            mat[j, i] = mat[i, j];
        }
    }
}
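// Sanity-check sketch (illustrative, not from the original suite) of the
// identity used above: sum(x * y) == n * (mean(x) * mean(y) + cov(x, y)),
// where cov is the population covariance.
unittest {
    double[] x = [1.0, 2, 3, 4];
    double[] y = [2.0, 4, 5, 9];
    immutable n = cast(double) x.length;
    immutable mx = reduce!"a + b"(0.0, x) / n;
    immutable my = reduce!"a + b"(0.0, y) / n;
    double cov = 0;
    foreach(i; 0..x.length) cov += (x[i] - mx) * (y[i] - my);
    cov /= n;
    assert(approxEqual(dotProduct(x, y), n * (mx * my + cov)));
}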
/**Struct that holds the results of a linear regression. It's a plain old
 * data struct.*/
struct RegressRes {
    /**The coefficients, one for each range in X. These will be in the order
     * that the X ranges were passed in.*/
    double[] betas;

    /**The standard error terms of the X ranges passed in.*/
    double[] stdErr;

    /**The lower confidence bounds of the beta terms, at the confidence level
     * specified. (Default 0.95).*/
    double[] lowerBound;

    /**The upper confidence bounds of the beta terms, at the confidence level
     * specified. (Default 0.95).*/
    double[] upperBound;

    /**The P-value for the alternative that the corresponding beta value is
     * different from zero against the null that it is equal to zero.*/
    double[] p;

    /**The coefficient of determination.*/
    double R2;

    /**The adjusted coefficient of determination.*/
    double adjustedR2;

    /**The root mean square of the residuals.*/
    double residualError;

    /**The P-value for the model as a whole. Based on an F-statistic. The
     * null here is that the model has no predictive value, the alternative
     * is that it does.*/
    double overallP;

    // Just used internally.
    private static string arrStr(T)(T arr) {
        return text(arr)[1..$ - 1];
    }

    /**Print out the results in the default format.*/
    string toString() {
        return "Betas: " ~ arrStr(betas) ~ "\nLower Conf. Int.: " ~
            arrStr(lowerBound) ~ "\nUpper Conf. Int.: " ~ arrStr(upperBound) ~
            "\nStd. Err: " ~ arrStr(stdErr) ~ "\nP Values: " ~ arrStr(p) ~
            "\nR^2: " ~ text(R2) ~
            "\nAdjusted R^2: " ~ text(adjustedR2) ~
            "\nStd. Residual Error: " ~ text(residualError)
            ~ "\nOverall P: " ~ text(overallP);
    }
}

/**Forward Range for holding the residuals from a regression analysis.*/
struct Residuals(F, U, T...) {
    static if(T.length == 1 && isForwardRange!(ElementType!(T[0]))) {
        alias T[0] R;
        alias typeof(array(R.init)) XType;
        enum bool needDup = true;
    } else {
        alias T R;
        alias staticMap!(Unqual, R) XType;
        enum bool needDup = false;
    }

    Unqual!U Y;
    XType X;
    F[] betas;
    double residual;
    bool _empty;

    void nextResidual() {
        double sum = 0;
        size_t i = 0;
        foreach(elem; X) {
            double frnt = elem.front;
            sum += frnt * betas[i];
            i++;
        }
        residual = Y.front - sum;
    }

    this(F[] betas, U Y, R X) {
        static if(is(typeof(X.length))) {
            dstatsEnforce(X.length == betas.length,
                "Betas and X must have same length for residuals.");
        } else {
            dstatsEnforce(walkLength(X) == betas.length,
                "Betas and X must have same length for residuals.");
        }

        static if(needDup) {
            this.X = array(X);
        } else {
            this.X = X;
        }

        foreach(i, elem; this.X) {
            static if(isForwardRange!(typeof(elem))) {
                this.X[i] = this.X[i].save;
            }
        }

        this.Y = Y;
        this.betas = betas;
        if(Y.empty) {
            _empty = true;
            return;
        }
        foreach(elem; X) {
            if(elem.empty) {
                _empty = true;
                return;
            }
        }
        nextResidual;
    }

    @property double front() const pure nothrow {
        return residual;
    }

    void popFront() {
        Y.popFront;
        if(Y.empty) {
            _empty = true;
            return;
        }
        foreach(ti, elem; X) {
            X[ti].popFront;
            if(X[ti].empty) {
                _empty = true;
                return;
            }
        }
        nextResidual;
    }

    @property bool empty() const pure nothrow {
        return _empty;
    }

    @property typeof(this) save() {
        auto ret = this;
        ret.Y = ret.Y.save;
        foreach(ti, xElem; ret.X) {
            ret.X[ti] = ret.X[ti].save;
        }

        return ret;
    }
}

/**Given the beta coefficients from a linear regression, and X and Y values,
 * returns a range that lazily computes the residuals.
 */
Residuals!(F, U, T) residuals(F, U, T...)(F[] betas, U Y, T X)
if(isFloatingPoint!F && isForwardRange!U && allSatisfy!(isForwardRange, T)) {
    alias Residuals!(F, U, T) RT;
    return RT(betas, Y, X);
}
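// Usage sketch (illustrative, not from the original suite): with an intercept
// term, the OLS residuals sum to zero up to rounding error.
unittest {
    double[] xs = [1.0, 2, 3, 4, 5];
    double[] ys = [2.0, 4.1, 5.9, 8.2, 9.8];
    auto bs = linearRegressBeta(ys, repeat(1), xs);
    immutable residSum = reduce!"a + b"(0.0, residuals(bs, ys, repeat(1), xs));
    assert(approxEqual(residSum, 0.0, 1e-6, 1e-6));
}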
// Compiles summary statistics while iterating, to allow ridge regression over
// input ranges.
private struct SummaryIter(R) {
    R range;
    MeanSD summ;

    this(R range) {
        this.range = range;
    }

    double front() @property {
        return range.front;
    }

    void popFront() {
        summ.put(range.front);
        range.popFront();
    }

    bool empty() @property {
        return range.empty;
    }

    double mse() @property const pure nothrow { return summ.mse; }

    double N() @property const pure nothrow { return summ.N; }
}

private template SummaryType(R) {
    alias SummaryIter!R SummaryType;
}

/**
Perform a linear regression and return just the beta values. The advantages
to just returning the beta values are that it's faster and that each range
needs to be iterated over only once, and thus can be just an input range.
The beta values are returned such that the smallest index corresponds to
the leftmost element of X. X can be either a tuple or a range of input
ranges. Y must be an input range.

If, after all X variables are passed in, a numeric type is passed as the last
parameter, this is treated as a ridge parameter and ridge regression is
performed. Ridge regression is a form of regression that penalizes the L2 norm
of the beta vector and therefore results in more parsimonious models.
However, it makes statistical inference such as that supported by
linearRegress() difficult to impossible. Therefore, linearRegress() doesn't
support ridges.

If no ridge parameter is passed, or equivalently if the ridge parameter is
zero, then ordinary least squares regression is performed.

Notes: The X ranges are traversed in lockstep, but the traversal is stopped
at the end of the shortest one. Therefore, using infinite ranges is safe.
For example, using repeat(1) to get an intercept term works.

References:

http://www.mathworks.com/help/toolbox/stats/ridge.html

Venables, W. N. & Ripley, B. D. (2002) Modern Applied Statistics with S.
Fourth Edition. Springer, New York. ISBN 0-387-95457-0
(This is the citation for the MASS R package.)

Examples:
---
int[] nBeers = [8,6,7,5,3,0,9];
int[] nCoffees = [3,6,2,4,3,6,8];
int[] musicVolume = [3,1,4,1,5,9,2];
int[] programmingSkill = [2,7,1,8,2,8,1];
double[] betas = linearRegressBeta(programmingSkill, repeat(1), nBeers, nCoffees,
    musicVolume, map!"a * a"(musicVolume));

// Now throw in a ridge parameter of 2.5.
double[] ridgeBetas = linearRegressBeta(programmingSkill, repeat(1), nBeers,
    nCoffees, musicVolume, map!"a * a"(musicVolume), 2.5);
---
*/
double[] linearRegressBeta(U, T...)(U Y, T XIn)
if(doubleInput!(U)) {
    double[] dummy;
    return linearRegressBetaBuf!(U, T)(dummy, Y, XIn);
}
/**
Same as linearRegressBeta, but allows the user to specify a buffer for
the beta terms. If the buffer is too short, a new one is allocated.
Otherwise, the results are returned in the user-provided buffer.
*/
double[] linearRegressBetaBuf(U, TRidge...)(double[] buf, U Y, TRidge XRidge)
if(doubleInput!(U)) {
    auto alloc = newRegionAllocator();

    static if(isFloatingPoint!(TRidge[$ - 1]) || isIntegral!(TRidge[$ - 1])) {
        // ridge param.
        alias XRidge[$ - 1] ridgeParam;
        alias TRidge[0..$ - 1] T;
        alias XRidge[0..$ - 1] XIn;
        enum bool ridge = true;
        dstatsEnforce(ridgeParam >= 0,
            "Cannot do ridge regression with ridge param < 0.");

        static SummaryIter!R summaryIter(R)(R range) {
            return typeof(return)(range);
        }

    } else {
        enum bool ridge = false;
        enum ridgeParam = 0;
        alias TRidge T;
        alias XRidge XIn;

        static R summaryIter(R)(R range) {
            return range;
        }
    }

    static if(isArray!(T[0]) && isInputRange!(typeof(XIn[0][0])) &&
        T.length == 1) {
        auto X = alloc.array(map!(summaryIter)(XIn[0]));
        alias typeof(X[0]) E;
    } else {
        static if(ridge) {
            alias staticMap!(SummaryType, T) XType;
            XType X;

            foreach(ti, elem; XIn) {
                X[ti] = summaryIter(elem);
            }
        } else {
            alias XIn X;
        }
    }

    DoubleMatrix xTx;
    double[] xTy = alloc.uninitializedArray!(double[])(X.length);
    double[] ret;
    if(buf.length < X.length) {
        ret = new double[X.length];
    } else {
        ret = buf[0..X.length];
    }

    rangeMatrixMulTrans(xTy, xTx, Y, X, alloc);

    static if(ridge) {
        if(ridgeParam > 0) {
            foreach(i, range; X) {
                // The whole matrix is scaled by N, as well as the xTy vector,
                // so scale the ridge param similarly.
                xTx[i, i] += ridgeParam * range.mse / range.N;
            }
        }
    }

    choleskySolve(xTx, xTy, ret);
    return ret;
}
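// Sketch of the buffer-reuse pattern this overload enables (illustrative;
// the module's own unittest does the same with a stack buffer):
unittest {
    double[] xs = [1.0, 2, 3, 4, 5, 6, 7];
    double[] ys = [8.0, 6, 7, 5, 3, 0, 9];
    double[2] buf;
    auto betas = linearRegressBetaBuf(buf[], ys, repeat(1), xs);
    assert(betas.ptr == buf.ptr);  // No allocation: results landed in buf.
}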
/**
Perform a linear regression as in linearRegressBeta, but return a
RegressRes with useful stuff for statistical inference. If the last element
of input is a real, this is used to specify the confidence intervals to
be calculated. Otherwise, the default of 0.95 is used. The rest of input
should be the elements of X.

When using this function, which provides several useful statistics useful
for inference, each range must be traversed twice. This means:

1. They have to be forward ranges, not input ranges.

2. If you have a large amount of data and you're mapping it to some
   expensive function, you may want to do this eagerly instead of lazily.

Notes:

The X ranges are traversed in lockstep, but the traversal is stopped
at the end of the shortest one. Therefore, using infinite ranges is safe.
For example, using repeat(1) to get an intercept term works.

If the confidence interval specified is exactly 0, this is treated as a
special case and confidence interval calculation is skipped. This can speed
things up significantly and therefore can be useful in monte carlo and possibly
data mining contexts.

Bugs: The statistical tests performed in this function assume that an
intercept term is included in your regression model. If no intercept term
is included, the P-values, confidence intervals and adjusted R^2 values
calculated by this function will be wrong.

Examples:
---
int[] nBeers = [8,6,7,5,3,0,9];
int[] nCoffees = [3,6,2,4,3,6,8];
int[] musicVolume = [3,1,4,1,5,9,2];
int[] programmingSkill = [2,7,1,8,2,8,1];

// Using default confidence interval:
auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
    musicVolume, map!"a * a"(musicVolume));

// Using user-specified confidence interval:
auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
    musicVolume, map!"a * a"(musicVolume), 0.8675309);
---
*/
RegressRes linearRegress(U, TC...)(U Y, TC input) {
    static if(is(TC[$ - 1] : double)) {
        double confLvl = input[$ - 1];
        enforceConfidence(confLvl);
        alias TC[0..$ - 1] T;
        alias input[0..$ - 1] XIn;
    } else {
        double confLvl = 0.95;  // Default.
        alias TC T;
        alias input XIn;
    }

    auto alloc = newRegionAllocator();
    static if(isForwardRange!(T[0]) && isForwardRange!(typeof(XIn[0].front())) &&
        T.length == 1) {

        enum bool arrayX = true;
        alias typeof(XIn[0].front) E;
        E[] X = alloc.array(XIn[0]);
    } else static if(allSatisfy!(isForwardRange, T)) {
        enum bool arrayX = false;
        alias XIn X;
    } else {
        static assert(0, "Linear regression can only be performed with " ~
            "tuples of forward ranges or ranges of forward ranges.");
    }

    auto xTx = doubleMatrix(X.length, X.length, alloc);
    auto xTxNeg1 = doubleMatrix(X.length, X.length, alloc);
    double[] xTy = alloc.uninitializedArray!(double[])(X.length);

    static if(arrayX) {
        auto xSaved = alloc.array(X);
        foreach(ref elem; xSaved) {
            elem = elem.save;
        }
    } else {
        auto xSaved = X;
        foreach(ti, Type; X) {
            xSaved[ti] = X[ti].save;
        }
    }

    rangeMatrixMulTrans(xTy, xTx, Y.save, X, alloc);
    invert(xTx, xTxNeg1);
    auto betas = new double[X.length];
    foreach(i; 0..betas.length) {
        betas[i] = 0;
        foreach(j; 0..betas.length) {
            betas[i] += xTxNeg1[i, j] * xTy[j];
        }
    }

    X = xSaved;
    auto residuals = residuals(betas, Y, X);
    double S = 0;
    ulong n = 0;
    PearsonCor R2Calc;
    for(; !residuals.empty; residuals.popFront) {
        immutable residual = residuals.front;
        S += residual * residual;
        immutable Yfront = residuals.Y.front;
        immutable predicted = Yfront - residual;
        R2Calc.put(predicted, Yfront);
        n++;
    }
    immutable ulong df = n - X.length;
    immutable double R2 = R2Calc.cor ^^ 2;
    immutable double adjustedR2 = 1.0L - (1.0L - R2) * ((n - 1.0L) / df);

    immutable double sigma2 = S / (n - X.length);

    double[] stdErr = new double[betas.length];
    foreach(i, ref elem; stdErr) {
        elem = sqrt(S * xTxNeg1[i, i] / df / n);
    }

    double[] lowerBound, upperBound;
    if(confLvl == 0) {
        // Then we're going to skip the computation to save time. (See below.)
        lowerBound = betas;
        upperBound = betas;
    } else {
        lowerBound = new double[betas.length];
        upperBound = new double[betas.length];
    }
    auto p = new double[betas.length];

    foreach(i, beta; betas) {
        try {
            p[i] = 2 * min(studentsTCDF(beta / stdErr[i], df),
                studentsTCDFR(beta / stdErr[i], df));
        } catch(DstatsArgumentException) {
            // Leave it as a NaN.
        }

        if(confLvl > 0) {
            // Skip confidence level computation if level is zero, to save
            // computation time. This is important in monte carlo and possibly
            // data mining contexts.
            try {
                double delta = invStudentsTCDF(0.5 * (1 - confLvl), df) *
                    stdErr[i];
                upperBound[i] = beta - delta;
                lowerBound[i] = beta + delta;
            } catch(DstatsArgumentException) {
                // Leave confidence bounds as NaNs.
            }
        }
    }

    double F = (R2 / (X.length - 1)) / ((1 - R2) / (n - X.length));
    double overallP;
    try {
        overallP = fisherCDFR(F, X.length - 1, n - X.length);
    } catch(DstatsArgumentException) {
        // Leave it as a NaN.
    }

    return RegressRes(betas, stdErr, lowerBound, upperBound, p, R2,
        adjustedR2, sqrt(sigma2), overallP);
}
/**Struct returned by polyFit.*/
struct PolyFitRes(T) {

    /**The array of PowMap ranges created by polyFit.*/
    T X;

    /**The rest of the results. This is alias this'd.*/
    RegressRes regressRes;
    alias regressRes this;
}

/**Convenience function that takes a forward range X and a forward range Y,
 * creates an array of PowMap structs for integer powers from 0 through N,
 * and calls linearRegressBeta.
 *
 * Returns: An array of doubles. The index of each element corresponds to
 * the exponent. For example, the X<sup>2</sup> term will have an index of
 * 2.
 */
double[] polyFitBeta(T, U)(U Y, T X, uint N, double ridge = 0) {
    double[] dummy;
    return polyFitBetaBuf!(T, U)(dummy, Y, X, N, ridge);
}

/**Same as polyFitBeta, but allows the caller to provide an explicit buffer
 * to return the coefficients in. If it's too short, a new one will be
 * allocated. Otherwise, results will be returned in the user-provided buffer.
 */
double[] polyFitBetaBuf(T, U)(double[] buf, U Y, T X, uint N, double ridge = 0) {
    auto alloc = newRegionAllocator();
    auto pows = alloc.uninitializedArray!(PowMap!(uint, T)[])(N + 1);
    foreach(exponent; 0..N + 1) {
        pows[exponent] = powMap(X, exponent);
    }

    if(ridge == 0) {
        return linearRegressBetaBuf(buf, Y, pows);
    } else {
        return linearRegressBetaBuf(buf, Y, pows, ridge);
    }
}

/**Convenience function that takes a forward range X and a forward range Y,
 * creates an array of PowMap structs for integer powers 0 through N,
 * and calls linearRegress.
 *
 * Returns: A PolyFitRes containing the array of PowMap structs created and
 * a RegressRes. The PolyFitRes is alias this'd to the RegressRes.*/
PolyFitRes!(PowMap!(uint, T)[])
polyFit(T, U)(U Y, T X, uint N, double confInt = 0.95) {
    enforceConfidence(confInt);
    auto pows = new PowMap!(uint, T)[N + 1];
    foreach(exponent; 0..N + 1) {
        pows[exponent] = powMap(X, exponent);
    }
    alias PolyFitRes!(typeof(pows)) RT;
    RT ret;
    ret.X = pows;
    ret.regressRes = linearRegress(Y, pows, confInt);
    return ret;
}
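// Usage sketch (illustrative, not from the original suite): fit a quadratic
// with polyFitBeta and read the coefficients off by exponent, as described
// in the doc comment above. The data here is a noiseless quadratic, so the
// fit recovers it exactly.
unittest {
    double[] xs = [1.0, 2, 3, 4, 5];
    auto ys = array(map!"3.0 + 2.0 * a + 0.5 * a * a"(xs));
    auto betas = polyFitBeta(ys, xs, 2);
    assert(approxEqual(betas, [3.0, 2.0, 0.5]));
}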
unittest {
    // These are a bunch of values gleaned from various examples on the Web.
    double[] heights = [1.47,1.5,1.52,1.55,1.57,1.60,1.63,1.65,1.68,1.7,1.73,1.75,
        1.78,1.8,1.83];
    double[] weights = [52.21,53.12,54.48,55.84,57.2,58.57,59.93,61.29,63.11,64.47,
        66.28,68.1,69.92,72.19,74.46];
    float[] diseaseSev = [1.9, 3.1, 3.3, 4.8, 5.3, 6.1, 6.4, 7.6, 9.8, 12.4];
    ubyte[] temperature = [2,1,5,5,20,20,23,10,30,25];

    // Values from R.
    auto res1 = polyFit(diseaseSev, temperature, 1);
    assert(approxEqual(res1.betas[0], 2.6623));
    assert(approxEqual(res1.betas[1], 0.2417));
    assert(approxEqual(res1.stdErr[0], 1.1008));
    assert(approxEqual(res1.stdErr[1], 0.0635));
    assert(approxEqual(res1.p[0], 0.0419));
    assert(approxEqual(res1.p[1], 0.0052));
    assert(approxEqual(res1.R2, 0.644));
    assert(approxEqual(res1.adjustedR2, 0.6001));
    assert(approxEqual(res1.residualError, 2.03));
    assert(approxEqual(res1.overallP, 0.00518));

    auto res2 = polyFit(weights, heights, 2);
    assert(approxEqual(res2.betas[0], 128.813));
    assert(approxEqual(res2.betas[1], -143.162));
    assert(approxEqual(res2.betas[2], 61.960));

    assert(approxEqual(res2.stdErr[0], 16.308));
    assert(approxEqual(res2.stdErr[1], 19.833));
    assert(approxEqual(res2.stdErr[2], 6.008));

    assert(approxEqual(res2.p[0], 4.28e-6));
    assert(approxEqual(res2.p[1], 1.06e-5));
    assert(approxEqual(res2.p[2], 2.57e-7));

    assert(approxEqual(res2.R2, 0.9989, 0.0001));
    assert(approxEqual(res2.adjustedR2, 0.9987, 0.0001));

    assert(approxEqual(res2.lowerBound[0], 92.9, 0.01));
    assert(approxEqual(res2.lowerBound[1], -186.8, 0.01));
    assert(approxEqual(res2.lowerBound[2], 48.7, 0.01));
    assert(approxEqual(res2.upperBound[0], 164.7, 0.01));
    assert(approxEqual(res2.upperBound[1], -99.5, 0.01));
    assert(approxEqual(res2.upperBound[2], 75.2, 0.01));

    auto res3 = linearRegress(weights, repeat(1), heights, map!"a * a"(heights));
    assert(res2.betas == res3.betas);

    double[2] beta1Buf;
    auto beta1 = linearRegressBetaBuf
        (beta1Buf[], diseaseSev, repeat(1), temperature);
    assert(beta1Buf.ptr == beta1.ptr);
    assert(beta1Buf[] == beta1[]);
    assert(approxEqual(beta1, res1.betas));
    auto beta2 = polyFitBeta(weights, heights, 2);
    assert(approxEqual(beta2, res2.betas));

    auto res4 = linearRegress(weights, repeat(1), heights);
    assert(approxEqual(res4.overallP, 3.604e-14));
    assert(approxEqual(res4.betas, [-39.062, 61.272]));
    assert(approxEqual(res4.p, [6.05e-9, 3.60e-14]));
    assert(approxEqual(res4.R2, 0.9892));
    assert(approxEqual(res4.adjustedR2, 0.9884));
    assert(approxEqual(res4.residualError, 0.7591));
    assert(approxEqual(res4.lowerBound, [-45.40912, 57.43554]));
    assert(approxEqual(res4.upperBound, [-32.71479, 65.10883]));

    // Test residuals.
    assert(approxEqual(residuals(res4.betas, weights, repeat(1), heights),
        [1.20184170, 0.27367611, 0.40823237, -0.06993322, 0.06462305,
        -0.40354255, -0.88170814, -0.74715188, -0.76531747, -0.63076120,
        -0.65892680, -0.06437053, -0.08253613, 0.96202014, 1.39385455]));

    // Test nonzero ridge parameters.
    // Values from R's MASS package.
    auto a = [1, 2, 3, 4, 5, 6, 7];
    auto b = [8, 6, 7, 5, 3, 0, 9];
    auto c = [2, 7, 1, 8, 2, 8, 1];

    // With a ridge param. of zero, ridge regression reduces to regular
    // OLS regression.
    assert(approxEqual(linearRegressBeta(a, repeat(1), b, c, 0),
        linearRegressBeta(a, repeat(1), b, c)));

    // Test the ridge regression. Values from R MASS package.
    auto ridge1 = linearRegressBeta(a, repeat(1), b, c, 1);
    auto ridge2 = linearRegressBeta(a, repeat(1), b, c, 2);
    auto ridge3 = linearRegressBeta(c, [[1,1,1,1,1,1,1], a, b], 10);
    assert(approxEqual(ridge1, [6.0357757, -0.2729671, -0.1337131]));
    assert(approxEqual(ridge2, [5.62367784, -0.22449854, -0.09775174]));
    assert(approxEqual(ridge3, [5.82653624, -0.05197246, -0.27185592]));
}
private MeanSD[] calculateSummaries(X...)(X xIn, RegionAllocator alloc) {
    // This is slightly wasteful because it sticks this shallow dup in
    // an unfreeable pos on TempAlloc.
    static if(X.length == 1 && isRoR!(X[0])) {
        auto ret = alloc.uninitializedArray!(MeanSD[])(xIn[0].length);
        auto alloc2 = newRegionAllocator();
        auto x = alloc2.array(xIn[0]);

        foreach(ref range; x) {
            range = range.save;
        }

        enum allHaveLength = hasLength!(ElementType!(typeof(x)));

    } else {
        auto ret = alloc.uninitializedArray!(MeanSD[])(xIn.length);
        alias xIn x;

        foreach(ti, R; X) {
            x[ti] = x[ti].save;
        }

        enum allHaveLength = allSatisfy!(hasLength, X);
    }

    ret[] = MeanSD.init;

    static if(allHaveLength) {
        size_t minLen = size_t.max;
        foreach(range; x) {
            minLen = min(minLen, range.length);
        }

        foreach(i, range; x) {
            ret[i] = meanStdev(take(range, minLen));
        }

    } else {
        bool someEmpty() {
            foreach(range; x) {
                if(range.empty) return true;
            }

            return false;
        }

        void popAll() {
            foreach(ti, elem; x) {
                x[ti].popFront();
            }
        }

        while(!someEmpty) {
            foreach(i, range; x) {
                ret[i].put(range.front);
            }
            popAll();
        }
    }

    return ret;
}

private double softThresh(double z, double gamma) {
    if(gamma >= abs(z)) {
        return 0;
    } else if(z > 0) {
        return z - gamma;
    } else {
        return z + gamma;
    }
}
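// Behavior sketch for the soft-thresholding operator above (illustrative,
// not from the original suite): softThresh shrinks z toward zero by gamma
// and clamps at zero, i.e. S(z, gamma) = sign(z) * max(abs(z) - gamma, 0).
unittest {
    assert(softThresh(3.0, 1.0) == 2.0);
    assert(softThresh(-3.0, 1.0) == -2.0);
    assert(softThresh(0.5, 1.0) == 0.0);
}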
private struct PreprocessedData {
    MeanSD[] xSumm;
    MeanSD ySumm;
    double[] y;
    double[][] x;
}

private PreprocessedData preprocessStandardize(Y, X...)
(Y yIn, X xIn, RegionAllocator alloc) {
    static if(X.length == 1 && isRoR!(X[0])) {
        auto xRaw = alloc.array(xIn[0]);
    } else {
        alias xIn xRaw;
    }

    auto summaries = calculateSummaries(xRaw, alloc);
    immutable minLen = to!size_t(
        reduce!min(
            map!"a.N"(summaries)
        )
    );

    auto x = alloc.uninitializedArray!(double[][])(summaries.length);
    foreach(i, range; xRaw) {
        x[i] = alloc.array(map!(to!double)(take(range, minLen)));
        x[i][] -= summaries[i].mean;
        x[i][] /= sqrt(summaries[i].mse);
    }

    double[] y;
    MeanSD ySumm;
    if(yIn.length) {
        y = alloc.array(map!(to!double)(take(yIn, minLen)));
        ySumm = meanStdev(y);
        y[] -= ySumm.mean;
    }

    return PreprocessedData(summaries, ySumm, y, x);
}

/**
Performs lasso (L1) and/or ridge (L2) penalized linear regression. Due to the
way the data is standardized, no intercept term should be included in x
(unlike linearRegress and linearRegressBeta). The intercept coefficient is
implicitly included and returned in the first element of the returned array.
Usage is otherwise identical.

Note: Setting lasso equal to zero is equivalent to performing ridge regression.
This can also be done with linearRegressBeta. However, the
linearRegressBeta algorithm is optimized for memory efficiency and
large samples. This algorithm is optimized for large feature sets.

Returns: The beta coefficients for the regression model.

References:

Friedman J, et al. Pathwise coordinate optimization. Ann. Appl. Stat.
2007;2:302-332.

Goeman, J. J. L1 penalized estimation in the Cox proportional hazards model.
Biometrical Journal 52(1), 70-84.

Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001 Classification of
microarray data with penalized logistic regression. Proceedings of SPIE.
Progress in Biomedical Optics and Images vol. 4266, pp. 187-198

Douglas M. Hawkins and Xiangrong Yin. A faster algorithm for ridge regression
of reduced rank data. Computational Statistics & Data Analysis Volume 40,
Issue 2, 28 August 2002, Pages 253-262

http://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
*/
double[] linearRegressPenalized(Y, X...)
(Y yIn, X xIn, double lasso, double ridge) {
    auto alloc = newRegionAllocator();

    auto preproc = preprocessStandardize(yIn, xIn, alloc);

    auto summaries = preproc.xSumm;
    auto ySumm = preproc.ySumm;
    auto x = preproc.x;
    auto y = preproc.y;

    auto betasFull = new double[x.length + 1];
    betasFull[] = 0;
    auto betas = betasFull[1..$];

    if(lasso > 0) {
        coordDescent(y, x, betas, lasso, ridge, null);
    } else if(y.length > x.length) {
        // Correct for different )#*$# scaling conventions.
        foreach(i, feature; x) {
            feature[] /= sqrt(summaries[i].mse);
        }

        linearRegressBetaBuf(betas, y, x, ridge);

        // More correction for diff. scaling factors.
        foreach(i, ref b; betas) {
            b /= sqrt(summaries[i].mse);
        }

    } else {
        ridgeLargeP(y, x, ridge, betas, null);
    }

    foreach(i, ref elem; betas) {
        elem /= sqrt(summaries[i].mse);
    }

    betasFull[0] = ySumm.mean;
    foreach(i, beta; betas) {
        betasFull[0] -= beta * summaries[i].mean;
    }

    return betasFull;
}
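// Call-shape sketch (illustrative; data borrowed from the unittest below).
// Note that, unlike linearRegress, no intercept column is passed in: the
// intercept comes back as element 0 of the result.
unittest {
    auto y = [1.0, 2.0, 3, 4, 5, 6, 7];
    auto x = [[8.0, 6, 7, 5, 3, 0, 9],
              [3.0, 6, 2, 4, 3, 6, 8]];
    auto betas = linearRegressPenalized(y, x, 0.5, 0.5);  // lasso = ridge = 0.5
    assert(betas.length == x.length + 1);  // Intercept + one beta per feature.
}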
private void coordDescent(
    double[] y,
    double[][] x,
    double[] betas,
    double lasso,
    double ridge,
    double[] w
) {
    auto alloc = newRegionAllocator();

    auto predictions = alloc.uninitializedArray!(double[])(y.length);
    predictions[] = 0;

    void makePredictions() {
        foreach(j, beta; betas) {
            predictions[] += x[j][] * beta;
        }
    }

    if(reduce!max(0.0, map!abs(betas)) > 0) {
        makePredictions();
    }

    auto residuals = alloc.uninitializedArray!(double[])(y.length);

    uint iter = 0;
    enum maxIter = 10_000;
    enum relEpsilon = 1e-5;
    enum absEpsilon = 1e-10;
    immutable n = cast(double) y.length;
    auto perm = alloc.array(iota(0U, x.length));

    auto weightDots = alloc.uninitializedArray!(double[])(x.length);
    if(w.length == 0) {
        ridge /= n;
        lasso /= n;
        weightDots[] = 1;
    } else {
        foreach(j, col; x) {
            weightDots[j] = dotProduct(w, map!"a * a"(col));
        }
    }

    double doIter(double[] betas, double[][] x) {
        double maxRelError = 0;
        bool predictionsChanged = true;

        foreach(j, ref b; betas) {
            if(b == 0) {
                if(predictionsChanged) {
                    residuals[] = y[] - predictions[];
                }
            } else {
                predictions[] -= x[j][] * b;
                residuals[] = y[] - predictions[];
            }

            double z;
            if(w.length) {
                z = 0;
                foreach(i, weight; w) {
                    z += weight * x[j][i] * residuals[i];
                }
            } else {
                z = dotProduct(residuals, x[j]) / n;
            }

            immutable newB = softThresh(z, lasso) / (weightDots[j] + ridge);
            immutable absErr = abs(b - newB);
            immutable err = absErr / max(abs(b), abs(newB));

            if(absErr > absEpsilon) {
                maxRelError = max(maxRelError, err);
            }

            if(b == 0 && newB == 0) {
                predictionsChanged = false;
            } else {
                predictionsChanged = true;
            }

            b = newB;
            if(b != 0) {
                predictions[] += x[j][] * b;
            }
        }

        return maxRelError;
    }

    void toConvergence() {
        double maxRelErr = doIter(betas, x);
        iter++;
        if(maxRelErr < relEpsilon) return;

        static bool absGreater(double x, double y) { return abs(x) > abs(y); }

        while(iter < maxIter) {
            size_t split = 0;
            while(split < betas.length && abs(betas[split]) > 0) {
                split++;
            }

            try {
                qsort!absGreater(betas, x, perm, weightDots);
            } catch(SortException) {
                betas[] = double.nan;
                break;
            }

            maxRelErr = double.infinity;
            for(; !(maxRelErr < relEpsilon) && split < betas.length
                && iter < maxIter; iter++) {
                maxRelErr = doIter(betas[0..split], x[0..split]);
            }

            maxRelErr = doIter(betas, x);
            iter++;
            if(maxRelErr < relEpsilon) break;
        }
    }

    toConvergence();
    try {
        qsort(perm, x, betas);
    } catch(SortException) {
        return;
    }
}
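// Reference note (added, following Friedman et al.): each coordinate update
// in doIter() above computes z_j = (1/n) * x_j . r (or its weighted analog)
// on the partial residuals r, then sets
//
//     b_j = softThresh(z_j, lasso) / (x_j' W x_j + ridge)
//
// where the denominator reduces to 1 + ridge in the unweighted case because
// the columns have been standardized.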
/*
Ridge regression, case where P > N, where P is number of features and N
is number of samples.
*/
private void ridgeLargeP(
    const(double)[] y,
    const double[][] x,
    immutable double lambda,
    double[] betas,
    const double[] w = null
) {
    static if(haveSvd) {
        return eilersRidge(y, x, lambda, betas, w);
    } else {
        return shermanMorrisonRidge(y, x, lambda, betas, w);
    }
}

version(scid) {
    version(nodeps) {
        enum haveSvd = false;
    } else {
        enum haveSvd = true;
    }
} else {
    enum haveSvd = false;
}

/*
An implementation of ridge regression for large dimension. Taken from:

Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001 Classification of
microarray data with penalized logistic regression. Proceedings of SPIE.
Progress in Biomedical Optics and Images vol. 4266, pp. 187-198

This algorithm is very fast, O(N^2 * P), but requires singular value
decomposition. Therefore, we only use it if we're using SciD with full
dependency support.
*/
static if(haveSvd) private void eilersRidge(
    const(double)[] yArr,
    const double[][] x,
    immutable double lambda,
    double[] betas,
    const double[] weightArr
) {
    if(x.length == 0) return;
    auto alloc = newRegionAllocator();
    immutable n = x[0].length;
    immutable p = x.length;

    auto xMat = ExternalMatrixView!double(n, p, alloc);
    foreach(i; 0..n) foreach(j; 0..p) {
        xMat[i, j] = x[j][i];
    }

    typeof(svdDestructive(xMat, SvdType.economy, alloc)) svdRes;
    try {
        svdRes = svdDestructive(xMat, SvdType.economy, alloc);
    } catch(Exception) {
        betas[] = double.nan;
        return;
    }

    auto us = eval(svdRes.u * svdRes.s, alloc);
    ExternalMatrixView!double usWus;
    ExternalVectorView!double usWy;

    // Have to cast away const because ExternalVectorView doesn't play
    // nice w/ it yet.
    auto y = ExternalVectorView!double(cast(double[]) yArr);

    if(weightArr.length) {
        // Once we've multiplied s by u, we don't need it anymore. Overwrite
        // its contents with the weight array to get weights in the form of
        // a diagonal matrix.
        auto w = svdRes.s;
        svdRes.s = typeof(svdRes.s).init;

        foreach(i, weight; weightArr) {
            w[i, i] = weight;
        }

        usWus = eval(us.t * w * us, alloc);
        usWy = eval(us.t * w * y, alloc);
    } else {
        usWus = eval(us.t * us, alloc);
        usWy = eval(us.t * y, alloc);
    }

    assert(usWus.rows == usWus.columns);
    assert(usWus.rows == n);
    foreach(i; 0..n) {
        usWus[i, i] += lambda;
    }

    auto theta = eval(inv(usWus) * usWy, alloc);

    // Work around weird SciD bug by transposing matrix manually.
    auto v = ExternalMatrixView!(double, StorageOrder.RowMajor)(
        svdRes.vt.columns,
        svdRes.vt.data[0..svdRes.vt.rows * svdRes.vt.columns]
    );
    eval(v * theta, betas);
}

/*
This algorithm is used as a fallback in case we don't have SVD available.
It's O(N P^2) instead of O(N^2 * P). It was adapted from the
following paper:

Douglas M. Hawkins and Xiangrong Yin. A faster algorithm for ridge regression
of reduced rank data. Computational Statistics & Data Analysis Volume 40,
Issue 2, 28 August 2002, Pages 253-262

It also uses Wikipedia's page on Sherman-Morrison:

http://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula

I simplified it by only doing rank-1 updates of inv(x' * x + lambda * I),
since I'm just trying to calculate the ridge estimate, not do efficient
cross-validation. The idea is to start with an empty dataset. Here,
inv(x' * x + lambda * I) = 1 / lambda * I. Then, add each sample (row of x)
sequentially. This is equivalent to adding s * s' to (x' * x + lambda * I)
where s is the vector of predictors for the sample. This is known as a
dyadic product. If p is the number of features/dimensions, and n is the
number of samples, we have n updates, each of which is O(P^2), for an
O(N * P^2) algorithm. Naive algorithms would be O(P^3).
*/
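// Added reference note: the rank-1 update formula used below is
// Sherman-Morrison,
//
//     inv(A + u * v') = inv(A) - (inv(A) * u) * (v' * inv(A)) / (1 + v' * inv(A) * u)
//
// with A the x'x + lambda * I accumulated so far, v the new sample vector s,
// and u = w * s (or just s in the unweighted case).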
static if(!haveSvd)
private void shermanMorrisonRidge(
    const(double)[] y,
    const(double[])[] x,  // Column major
    immutable double lambda,
    double[] betas,
    const(double)[] w
) in {
    assert(lambda > 0);
    foreach(col; x) assert(col.length == x[0].length);
    if(x.length) assert(y.length == x[0].length);
} body {
    auto alloc = newRegionAllocator();
    immutable p = x.length;
    if(p == 0) return;
    immutable n = x[0].length;

    auto v = alloc.uninitializedArray!(double[])(p);
    double[] u;
    if(w.length) {
        u = alloc.uninitializedArray!(double[])(p);
    } else {
        u = v;
    }

    auto vTxTxNeg1 = alloc.uninitializedArray!(double[])(p);
    auto xTxNeg1u = alloc.uninitializedArray!(double[])(p);

    // Before any updates are done, x'x = I * lambda, so inv(x'x) = I / lambda.
    auto xTxNeg1 = alloc.uninitializedArray!(double[][])(p, p);
    foreach(i; 0..p) foreach(j; 0..p) {
        xTxNeg1[i][j] = (i == j) ? (1.0 / lambda) : 0;
    }

    foreach(sampleIndex; 0..n) {
        copy(transversal(x[], sampleIndex), v);
        if(w.length) {
            u[] = w[sampleIndex] * v[];
        }

        // Calculate denominator of update: 1 + v' * xTxNeg1 * u.
        vTxTxNeg1[] = 0;
        foreach(rowIndex, row; xTxNeg1) {
            vTxTxNeg1[] += v[rowIndex] * row[];
        }
        immutable denom = 1.0 + dotProduct(vTxTxNeg1, u);

        // Calculate numerator. The parentheses indicate how the operation
        // is coded. This is for computational efficiency. Removing the
        // parentheses would be mathematically equivalent due to associativity:
        // (xTxNeg1 * u) * (v' * xTxNeg1).
        xTxNeg1u[] = 0;
        foreach(rowIndex, row; xTxNeg1) {
            xTxNeg1u[rowIndex] = dotProduct(row, u);
        }

        foreach(i, row; xTxNeg1) {
            immutable multiplier = xTxNeg1u[i] / denom;
            row[] -= multiplier * vTxTxNeg1[];
        }
    }

    auto xTy = alloc.uninitializedArray!(double[])(p);
    if(w.length) {
        auto yw = alloc.uninitializedArray!(double[])(n);
        yw[] = y[] * w[];
        y = yw;
    }

    foreach(colIndex, col; x) {
        xTy[colIndex] = dotProduct(col[], y[]);
    }

    foreach(rowIndex, row; xTxNeg1) {
        betas[rowIndex] = dotProduct(row, xTy);
    }
}
unittest {
    // Test ridge regression. We have three impls for all kinds of diff.
    // scenarios. See if they all agree. Note that the ridiculously small but
    // nonzero lasso param is to force the use of the coord descent algo.
    auto y = new double[12];
    auto x = new double[][16];
    foreach(ref elem; x) elem = new double[12];
    x[0][] = 1;
    auto gen = Random(31415);  // For random but repeatable results.

    foreach(iter; 0..1000) {
        foreach(col; x[1..$]) foreach(ref elem; col) elem = rNormal(0, 1, gen);
        foreach(ref elem; y) elem = rNormal(0, 1, gen);
        immutable ridge = uniform(0.1, 10.0, gen);

        auto normalEq = linearRegressBeta(y, x, ridge);
        auto coordDescent = linearRegressPenalized(
            y, x[1..$], double.min_normal, ridge);
        auto linalgTrick = linearRegressPenalized(y, x[1..$], 0, ridge);

        // Every once in a blue moon coordinate descent doesn't converge that
        // well. These small errors are of no practical significance, hence
        // the wide tolerance. However, if the direct normal equations
        // and linalg trick don't agree extremely closely, then something's
        // fundamentally wrong.
        assert(approxEqual(normalEq, coordDescent, 0.02, 1e-4), text(
            normalEq, coordDescent));
        assert(approxEqual(linalgTrick, coordDescent, 0.02, 1e-4), text(
            linalgTrick, coordDescent));
        assert(approxEqual(normalEq, linalgTrick, 1e-6, 1e-8), text(
            normalEq, linalgTrick));
    }

    // Test stuff that's got some lasso in it. Values from R's Penalized
    // package.
    y = [1.0, 2.0, 3, 4, 5, 6, 7];
    x = [[8.0, 6, 7, 5, 3, 0, 9],
        [3.0, 6, 2, 4, 3, 6, 8],
        [3.0, 1, 4, 1, 5, 9, 2],
        [2.0, 7, 1, 8, 2, 8, 1]];

    assert(approxEqual(linearRegressPenalized(y, x, 1, 0),
        [4.16316, -0.3603197, 0.6308278, 0, -0.2633263]));
    assert(approxEqual(linearRegressPenalized(y, x, 1, 3),
        [2.519590, -0.09116883, 0.38067757, 0.13122413, -0.05637939]));
    assert(approxEqual(linearRegressPenalized(y, x, 2, 0.1),
        [1.247235, 0, 0.4440735, 0.2023602, 0]));
    assert(approxEqual(linearRegressPenalized(y, x, 5, 7),
        [3.453787, 0, 0.10968736, 0.01253992, 0]));
}

/**
Computes a logistic regression using a maximum likelihood estimator
and returns the beta coefficients. This is a generalized linear model with
the link function f(XB) = 1 / (1 + exp(-XB)). This is generally used to model
the probability that a binary Y variable is 1 given a set of X variables.

For the purpose of this function, Y variables are interpreted as Booleans,
regardless of their type. X may be either a range of ranges or a tuple of
ranges. However, note that unlike in linearRegress, they are copied to an
array if they are not random access ranges. Note that each value is accessed
several times, so if your range is a map to something expensive, you may
want to evaluate it eagerly.

If the last parameter passed in is a numeric value instead of a range,
it is interpreted as a ridge parameter and ridge regression is performed. This
penalizes the L2 norm of the beta vector (in a scaled space) and results
in more parsimonious models. It limits the usefulness of inference techniques
(p-values, confidence intervals), however, and is therefore not offered
in logisticRegress().

If no ridge parameter is passed, or equivalently if the ridge parameter is
zero, then ordinary maximum likelihood regression is performed.

Note that, while this implementation of ridge regression was tested against
the R Design Package implementation, it uses slightly different conventions
that make the results not comparable without transformation. dstats uses a
biased estimate of the variance to scale the beta vector penalties, while
Design uses an unbiased estimate. Furthermore, Design penalizes by 1/2 of the
L2 norm, whereas dstats penalizes by the L2 norm. Therefore, if n is the
sample size, and lambda is the penalty used with dstats, the proper penalty
to use in Design to get the same results is 2 * (n - 1) * lambda / n.

Also note that, as in linearRegress, repeat(1) can be used for the intercept
term.

Returns: The beta coefficients for the regression model.

References:

http://en.wikipedia.org/wiki/Logistic_regression

http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf

S. Le Cessie and J. C. Van Houwelingen. Ridge Estimators in Logistic
Regression. Journal of the Royal Statistical Society. Series C
(Applied Statistics), Vol. 41, No. 1 (1992), pp. 191-201

Frank E Harrell Jr (2009). Design: Design Package. R package version 2.3-0.
http://CRAN.R-project.org/package=Design
*/
double[] logisticRegressBeta(T, U...)(T yIn, U xRidge) {
    static if(isFloatingPoint!(U[$ - 1]) || isIntegral!(U[$ - 1])) {
        alias xRidge[$ - 1] ridge;
        alias xRidge[0..$ - 1] xIn;
    } else {
        enum double ridge = 0.0;
        alias xRidge xIn;
    }

    return logisticRegressImpl(false, ridge, yIn, xIn).betas;
}

/**
Plain old data struct to hold the results of a logistic regression.
*/
struct LogisticRes {
    /**The coefficients, one for each range in X. These will be in the order
     * that the X ranges were passed in.*/
    double[] betas;

    /**The standard error terms of the X ranges passed in.*/
    double[] stdErr;

    /**
    The Wald lower confidence bounds of the beta terms, at the confidence level
    specified. (Default 0.95).*/
    double[] lowerBound;

    /**
    The Wald upper confidence bounds of the beta terms, at the confidence level
    specified. (Default 0.95).*/
    double[] upperBound;

    /**
    The P-value for the alternative that the corresponding beta value is
    different from zero against the null that it is equal to zero. These
    are calculated using the Wald Test.*/
    double[] p;

    /**
    The log likelihood for the null model.
    */
    double nullLogLikelihood;

    /**
    The log likelihood for the model fit.
    */
    double logLikelihood;

    /**
    Akaike Information Criterion, which is a complexity-penalized goodness-
    of-fit score, equal to 2 * k - 2 * log(L), where log(L) is the log
    likelihood of the model and k is the number of parameters.
    */
    double aic() const pure nothrow @property @safe {
        return 2 * (betas.length - logLikelihood);
    }

    /**
    The P-value for the model as a whole, based on the likelihood ratio test.
    The null here is that the model has no predictive value, the alternative
    is that it does have predictive value.*/
    double overallP;

    // Just used internally.
    private static string arrStr(T)(T arr) {
        return text(arr)[1..$ - 1];
    }

    /**Print out the results in the default format.*/
    string toString() {
        return "Betas: " ~ arrStr(betas) ~ "\nLower Conf. Int.: " ~
            arrStr(lowerBound) ~ "\nUpper Conf. Int.: " ~ arrStr(upperBound) ~
            "\nStd. Err: " ~ arrStr(stdErr) ~ "\nP Values: " ~ arrStr(p) ~
            "\nNull Log Likelihood: " ~ text(nullLogLikelihood) ~
            "\nLog Likelihood: " ~ text(logLikelihood) ~
            "\nAIC: " ~ text(aic) ~
            "\nOverall P: " ~ text(overallP);
    }
}

/**
Similar to logisticRegressBeta, but returns a LogisticRes with useful stuff for
statistical inference. If the last element of input is a floating point
number instead of a range, it is used to specify the confidence interval
calculated. Otherwise, the default of 0.95 is used.

References:

http://en.wikipedia.org/wiki/Wald_test
http://en.wikipedia.org/wiki/Akaike_information_criterion
*/
LogisticRes logisticRegress(T, V...)(T yIn, V input) {
    return logisticRegressImpl!(T, V)(true, 0, yIn, input);
}
unittest {
    // Values from R. Confidence intervals from confint.default().
    // R doesn't automatically calculate the likelihood ratio P-value, and
    // reports deviances instead of log likelihoods. Deviances are just
    // -2 * log likelihood.
    alias approxEqual ae;  // Save typing.

    // Start with the basics, with X as a ror.
    auto y1 = [1, 0, 0, 0, 1, 0, 0];
    auto x1 = [[1.0, 1, 1, 1, 1, 1, 1],
        [8.0, 6, 7, 5, 3, 0, 9]];
    auto res1 = logisticRegress(y1, x1);
    assert(ae(res1.betas[0], -0.98273));
    assert(ae(res1.betas[1], 0.01219));
    assert(ae(res1.stdErr[0], 1.80803));
    assert(ae(res1.stdErr[1], 0.29291));
    assert(ae(res1.p[0], 0.587));
    assert(ae(res1.p[1], 0.967));
    assert(ae(res1.aic, 12.374));
    assert(ae(res1.logLikelihood, -0.5 * 8.3758));
    assert(ae(res1.nullLogLikelihood, -0.5 * 8.3740));
    assert(ae(res1.lowerBound[0], -4.5264052));
    assert(ae(res1.lowerBound[1], -0.5618933));
    assert(ae(res1.upperBound[0], 2.560939));
    assert(ae(res1.upperBound[1], 0.586275));

    // Use tuple.
    auto y2 = [1,0,1,1,0,1,0,0,0,1,0,1];
    auto x2_1 = [3,1,4,1,5,9,2,6,5,3,5,8];
    auto x2_2 = [2,7,1,8,2,8,1,8,2,8,4,5];
    auto res2 = logisticRegress(y2, repeat(1), x2_1, x2_2);
    assert(ae(res2.betas[0], -1.1875));
    assert(ae(res2.betas[1], 0.1021));
    assert(ae(res2.betas[2], 0.1603));
    assert(ae(res2.stdErr[0], 1.5430));
    assert(ae(res2.stdErr[1], 0.2507));
    assert(ae(res2.stdErr[2], 0.2108));
    assert(ae(res2.p[0], 0.442));
    assert(ae(res2.p[1], 0.684));
    assert(ae(res2.p[2], 0.447));
    assert(ae(res2.aic, 21.81));
    assert(ae(res2.nullLogLikelihood, -0.5 * 16.636));
    assert(ae(res2.logLikelihood, -0.5 * 15.810));
    assert(ae(res2.lowerBound[0], -4.2116584));
    assert(ae(res2.lowerBound[1], -0.3892603));
    assert(ae(res2.lowerBound[2], -0.2528110));
    assert(ae(res2.upperBound[0], 1.8366823));
    assert(ae(res2.upperBound[1], 0.5934631));
    assert(ae(res2.upperBound[2], 0.5733693));

    auto x2Intercept = [1,1,1,1,1,1,1,1,1,1,1,1];
    auto res2a = logisticRegress(y2,
        filter!"a.length"([x2Intercept, x2_1, x2_2]));
    foreach(ti, elem; res2a.tupleof) {
        assert(ae(elem, res2.tupleof[ti]));
    }

    // Use a huge range of values to test numerical stability.

    // The filter is to make y3 a non-random access range.
    auto y3 = filter!"a < 2"([1,1,1,1,0,0,0,0]);
    auto x3_1 = filter!"a > 0"([1, 1e10, 2, 2e10, 3, 3e15, 4, 4e7]);
    auto x3_2 = [1e8, 1e6, 1e7, 1e5, 1e3, 1e0, 1e9, 1e11];
    auto x3_3 = [-5e12, 5e2, 6e5, 4e3, -999999, -666, -3e10, -2e10];
    auto res3 = logisticRegress(y3, repeat(1), x3_1, x3_2, x3_3, 0.99);
    assert(ae(res3.betas[0], 1.115e0));
    assert(ae(res3.betas[1], -4.674e-15));
    assert(ae(res3.betas[2], -7.026e-9));
    assert(ae(res3.betas[3], -2.109e-12));
    assert(ae(res3.stdErr[0], 1.158));
    assert(ae(res3.stdErr[1], 2.098e-13));
    assert(ae(res3.stdErr[2], 1.878e-8));
    assert(ae(res3.stdErr[3], 4.789e-11));
    assert(ae(res3.p[0], 0.336));
    assert(ae(res3.p[1], 0.982));
    assert(ae(res3.p[2], 0.708));
    assert(ae(res3.p[3], 0.965));
    assert(ae(res3.aic, 12.544));
    assert(ae(res3.nullLogLikelihood, -0.5 * 11.0904));
    assert(ae(res3.logLikelihood, -0.5 * 4.5442));
    // Not testing confidence intervals b/c they'd be so buried in numerical
    // fuzz.

    // Test with a just plain huge dataset that R chokes for several minutes
    // on. If you think this unittest is slow, try getting the reference
    // values from R.
    auto y4 = chain(
        take(cycle([0,0,0,0,1]), 500_000),
        take(cycle([1,1,1,1,0]), 500_000));
    auto x4_1 = iota(0, 1_000_000);
    auto x4_2 = array(map!(compose!(exp, "a / 1_000_000.0"))(x4_1));
    auto x4_3 = take(cycle([1,2,3,4,5]), 1_000_000);
    auto x4_4 = take(cycle([8,6,7,5,3,0,9]), 1_000_000);
    auto res4 = logisticRegress(y4, repeat(1), x4_1, x4_2, x4_3, x4_4, 0.99);
    assert(ae(res4.betas[0], -1.574));
    assert(ae(res4.betas[1], 5.625e-6));
    assert(ae(res4.betas[2], -7.282e-1));
    assert(ae(res4.betas[3], -4.381e-6));
    assert(ae(res4.betas[4], -8.343e-6));
    assert(ae(res4.stdErr[0], 3.693e-2));
    assert(ae(res4.stdErr[1], 7.188e-8));
    assert(ae(res4.stdErr[2], 4.208e-2));
    assert(ae(res4.stdErr[3], 1.658e-3));
    assert(ae(res4.stdErr[4], 8.164e-4));
    assert(ae(res4.p[0], 0));
    assert(ae(res4.p[1], 0));
    assert(ae(res4.p[2], 0));
    assert(ae(res4.p[3], 0.998));
    assert(ae(res4.p[4], 0.992));
    assert(ae(res4.aic, 1089339));
    assert(ae(res4.nullLogLikelihood, -0.5 * 1386294));
    assert(ae(res4.logLikelihood, -0.5 * 1089329));
    assert(ae(res4.lowerBound[0], -1.668899));
    assert(ae(res4.lowerBound[1], 5.439787e-6));
    assert(ae(res4.lowerBound[2], -0.8366273));
    assert(ae(res4.lowerBound[3], -4.27406e-3));
    assert(ae(res4.lowerBound[4], -2.111240e-3));
    assert(ae(res4.upperBound[0], -1.478623));
    assert(ae(res4.upperBound[1], 5.810089e-6));
    assert(ae(res4.upperBound[2], -6.198418e-1));
    assert(ae(res4.upperBound[3], 4.265302e-3));
    assert(ae(res4.upperBound[4], 2.084554e-3));

    // Test ridge stuff.
    auto ridge2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2, 3);
    assert(ae(ridge2[0], -0.40279319));
    assert(ae(ridge2[1], 0.03575638));
    assert(ae(ridge2[2], 0.05313875));

    auto ridge2_2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2, 2);
    assert(ae(ridge2_2[0], -0.51411490));
    assert(ae(ridge2_2[1], 0.04536590));
    assert(ae(ridge2_2[2], 0.06809964));
}

/// The logistic function used in logistic regression.
double logistic(double xb) pure nothrow @safe {
    return 1.0 / (1 + exp(-xb));
}
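// Quick sanity sketch for the link function (illustrative, not from the
// original suite): logistic(0) is exactly 0.5 and logistic is symmetric
// about that point.
unittest {
    assert(logistic(0) == 0.5);
    assert(approxEqual(logistic(4) + logistic(-4), 1.0));
}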

/**
Performs lasso (L1) and/or ridge (L2) penalized logistic regression. Due to the
way the data is standardized, no intercept term should be included in x
(unlike logisticRegress and logisticRegressBeta). The intercept coefficient is
implicitly included and returned in the first element of the returned array.
Usage is otherwise identical.

Note: Setting lasso equal to zero is equivalent to performing ridge regression.
This can also be done with logisticRegressBeta. However, the
logisticRegressBeta algorithm is optimized for memory efficiency and
large samples. This algorithm is optimized for large feature sets.

Returns: The beta coefficients for the regression model.

References:

Friedman J, et al. Pathwise coordinate optimization. Ann. Appl. Stat.
2007;2:302-332.

Goeman, J. J. L1 penalized estimation in the Cox proportional hazards model.
Biometrical Journal 52(1), 70-84.

Eilers, P., Boer, J., Van Ommen, G., Van Houwelingen, H. 2001. Classification
of microarray data with penalized logistic regression. Proceedings of SPIE.
Progress in Biomedical Optics and Images vol. 4266, pp. 187-198.

Douglas M. Hawkins and Xiangrong Yin. A faster algorithm for ridge regression
of reduced rank data. Computational Statistics & Data Analysis Volume 40,
Issue 2, 28 August 2002, Pages 253-262.

http://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
*/
double[] logisticRegressPenalized(Y, X...)
(Y yIn, X xIn, double lasso, double ridge) {
    auto alloc = newRegionAllocator();

    static assert(!isInfinite!Y, "Can't do regression with infinite # of Y's.");
    static if(isRandomAccessRange!Y) {
        alias yIn y;
    } else {
        auto y = toBools(yIn, alloc);
    }

    static if(X.length == 1 && isRoR!X) {
        enum bool tupleMode = false;
        static if(isForwardRange!X) {
            auto x = toRandomAccessRoR(y.length, xIn, alloc);
        } else {
            auto x = toRandomAccessRoR(y.length, alloc.array(xIn), alloc);
        }
    } else {
        enum bool tupleMode = true;
        auto xx = toRandomAccessTuple(xIn, alloc);
        auto x = xx.expand;
    }

    auto betas = new double[x.length + 1];
    if(y.length >= x.length && lasso == 0) {
        // Add intercept term.
        static if(tupleMode) {
            doMLENewton(betas, (double[]).init, ridge, y, repeat(1), x);
        } else static if(is(typeof(x) == double[][])) {
            // x is already double[][]; just prepend an intercept column.
            auto xInt = alloc.uninitializedArray!(double[][])(betas.length);
            xInt[1..$] = x[];
            xInt[0] = alloc.array(repeat(1.0, y.length));
            doMLENewton(betas, (double[]).init, ridge, y, xInt);
        } else {
            // No choice but to dup the whole thing.
            auto xInt = alloc.uninitializedArray!(double[][])(betas.length);
            xInt[0] = alloc.array(repeat(1.0, y.length));

            foreach(i, ref arr; xInt[1..$]) {
                arr = alloc.array(
                    map!(to!double)(take(x[i], y.length))
                );
            }

            doMLENewton(betas, (double[]).init, ridge, y, xInt);
        }
    } else {
        logisticRegressPenalizedImpl(betas, lasso, ridge, y, x);
    }

    return betas;
}
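
/*
A minimal usage sketch (the data below is invented for illustration and is
not from the test suite). Note that no intercept column is passed, since
logisticRegressPenalized adds the intercept itself and returns it in
betas[0]; betas[1..$] correspond to the rows of the predictor matrix in
the order passed.
---
bool[] outcome = [true, false, true, false, true, false];
auto preds = [[1.1, 2.0, 0.3, 4.2, 2.2, 0.5],
              [0.7, 0.1, 3.3, 2.8, 1.9, 1.0]];

// lasso = 0.5, ridge = 1.0: an elastic net-style fit.
double[] betas = logisticRegressPenalized(outcome, preds, 0.5, 1.0);
---
*/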

unittest {
    // Test ridge regression. We have three impls for all kinds of diff.
    // scenarios. See if they all agree. Note that the ridiculously small but
    // nonzero lasso param is to force the use of the coord descent algo.
    auto y = new bool[12];
    auto x = new double[][16];
    foreach(ref elem; x) elem = new double[12];
    x[0][] = 1;
    auto gen = Random(31415);  // For random but repeatable results.

    foreach(iter; 0..1000) {
        foreach(col; x[1..$]) foreach(ref elem; col) elem = rNormal(0, 1, gen);

        // Nothing will converge if y is all trues or all falses.
        size_t trueCount;
        do {
            foreach(ref elem; y) elem = cast(bool) rBernoulli(0.5, gen);
            trueCount = count!"a"(y);
        } while(trueCount == 0 || trueCount == y.length);

        immutable ridge = uniform(0.1, 10.0, gen);

        auto normalEq = logisticRegressBeta(y, x, ridge);
        auto coordDescent = logisticRegressPenalized(
            y, x[1..$], double.min_normal, ridge);
        auto linalgTrick = logisticRegressPenalized(y, x[1..$], 0, ridge);

        // Every once in a blue moon coordinate descent doesn't converge that
        // well. These small errors are of no practical significance, hence
        // the wide tolerance. However, if the direct normal equations
        // and linalg trick don't agree extremely closely, then something's
        // fundamentally wrong.
        assert(approxEqual(normalEq, coordDescent, 0.02, 1e-4), text(
            normalEq, coordDescent));
        assert(approxEqual(linalgTrick, coordDescent, 0.02, 1e-4), text(
            linalgTrick, coordDescent));
        assert(approxEqual(normalEq, linalgTrick, 1e-6, 1e-8), text(
            normalEq, linalgTrick));
    }

    assert(approxEqual(logisticRegressBeta(y, x[0], x[1], x[2]),
        logisticRegressPenalized(y, x[1], x[2], 0, 0)));
    assert(approxEqual(logisticRegressBeta(y, [x[0], x[1], x[2]]),
        logisticRegressPenalized(y, [x[1], x[2]], 0, 0)));
    assert(approxEqual(logisticRegressBeta(y, [x[0], x[1], x[2]]),
        logisticRegressPenalized(y,
            [to!(float[])(x[1]), to!(float[])(x[2])], 0, 0)));

    // Make sure the adding intercept stuff is right for the Newton path.
    //assert(logisticRegressBeta(x[0], x[1], x[2]) ==

    // Test stuff that's got some lasso in it. Values from R's Penalized
    // package.
    y = [1, 0, 0, 1, 1, 1, 0];
    x = [[8.0, 6, 7, 5, 3, 0, 9],
         [3.0, 6, 2, 4, 3, 6, 8],
         [3.0, 1, 4, 1, 5, 9, 2],
         [2.0, 7, 1, 8, 2, 8, 1]];

    // Values from R's Penalized package. Note that it uses a convention for
    // the ridge parameter such that Penalized ridge = 2 * dstats ridge.
    assert(approxEqual(logisticRegressPenalized(y, x, 1, 0),
        [1.642080, -0.22086515, -0.02587546, 0.00000000, 0.00000000]));
    assert(approxEqual(logisticRegressPenalized(y, x, 1, 3),
        [0.5153373, -0.04278257, -0.00888014, 0.01316831, 0.00000000]));
    assert(approxEqual(logisticRegressPenalized(y, x, 2, 0.1),
        [0.2876821, 0, 0, 0, 0]));
    assert(approxEqual(logisticRegressPenalized(y, x, 1.2, 7),
        [0.367613, -0.017227631, 0.000000000, 0.003875104, 0.000000000]));
}

/**
This function performs loess regression. Loess regression is a local
regression procedure, where a prediction of the dependent (y) variable
is made from an observation of the independent (x) variable by weighted
least squares over x values in the neighborhood of the value being evaluated.

In the future a separate function may be included to perform loess regression
with multiple predictors. However, one predictor is the much more common
case, and the multiple predictor case would require a substantially different
API and implementation, so for now only one predictor is supported.

Params:

y = Observations of the dependent variable.
x = Observations of the independent variable.
span = The fraction of x observations considered to be "in the neighborhood"
    when performing local regression to predict the y value for a new x.
    For example, if 8 observations are provided and span == 0.5,
    the 4 nearest neighbors will be used on evaluation.
degree = The polynomial degree of the local regression. Must be less than
    the number of neighbors (span * x.length).

Returns:

A Loess1D object. This object can be used to make predictions based on
the loess model. The computations involved are done lazily, i.e. this
function sets up the Loess1D instance and returns without computing
any regression models.

Examples:
---
auto x = [1, 2, 3, 4, 5, 6, 7];
auto y = [3, 6, 2, 4, 3, 6, 8];

// Build the Loess1D object.
auto model = loess1D(y, x, 0.5);

// Compute the weights for robust regression.
model.computeRobustWeights(2);

// Predict the value of y when x == 5.5, using a robustness level of 2.
auto prediction = model.predict(5.5, 2);
---

References:

Cleveland, W.S. (1979). "Robust Locally Weighted Regression and Smoothing
Scatterplots". Journal of the American Statistical Association 74 (368):
829-836.
*/
Loess1D loess1D(RX, RY)(
    RY y,
    RX x,
    double span,
    int degree = 1
) {
    dstatsEnforce(span > 0 && span <= 1, format(
        "Span must be >0 and <= 1 for loess1D, not %s.", span
    ));
    dstatsEnforce(degree >= 0, "Degree must be >= 0 for loess1D.");

    auto ret = new Loess1D;
    ret._x = array(map!(to!double)(x));
    ret._y = array(map!(to!double)(y));
    qsort(ret._x, ret._y);

    ret.nNeighbors = to!int(ret.x.length * span);
    dstatsEnforce(ret.nNeighbors > degree, format(
        "Cannot do degree %s loess with a window of only %s points. " ~
        "Increase the span parameter.", degree, ret.nNeighbors
    ));

    ret._degree = degree;
    auto xPoly = new double[][degree];
    if(degree > 0) xPoly[0] = ret._x;

    foreach(d; 2..degree + 1) {
        xPoly[d - 1] = ret._x.dup;
        xPoly[d - 1][] ^^= d;
    }

    ret.xPoly = xPoly;
    return ret;
}

unittest {
    auto x = [1, 2, 3, 4, 5, 6, 7, 8];
    auto y = [3, 2, 8, 2, 6, 9, 0, 1];

    auto loess1 = loess1D(y, x, 0.75, 1);

    // Values from R's lowess() function. This gets slightly different
    // results than loess(), probably due to disagreements about windowing
    // details.
    assert(approxEqual(loess1.predictions(0),
        [2.9193046, 3.6620295, 4.2229953, 5.2642335, 5.3433985, 4.4225636,
         2.7719778, 0.6643268]
    ));

    loess1 = loess1D(y, x, 0.5, 1);
    assert(approxEqual(loess1.predictions(0),
        [2.1615941, 4.0041736, 4.5642738, 4.8631052, 5.7136895, 5.5642738,
         2.8631052, -0.1977227]
    ));

    assert(approxEqual(loess1.predictions(2),
        [2.2079526, 3.9809030, 4.4752888, 4.8849727, 5.7260333, 5.4465225,
         2.8769120, -0.1116018]
    ));

    // Test 0th and 2nd order using R's loess() function, since lowess()
    // doesn't support anything besides first degree.
    auto loess0 = loess1D(y, x, 0.5, 0);
    assert(approxEqual(loess0.predictions(0),
        [3.378961, 4.004174, 4.564274, 4.863105, 5.713689, 5.564274, 2.863105,
         1.845369]
    ));

    // Not testing the last point. R's loess() consistently gets slightly
    // different answers for the last point than either this function or
    // R's lowess() for degree > 0. (This function and R's lowess() agree
    // when this happens.) It's not clear which is right, but the differences
    // are small and not practically important.
    auto loess2 = loess1D(y, x, 0.75, 2);
    assert(approxEqual(loess2.predictions(0)[0..$ - 1],
        [2.4029984, 4.1021339, 4.8288941, 4.5523535, 6.0000000, 6.4476465,
         3.7669741]
    ));
}
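
/*
For reference, the local weights used by Loess1D.predict below are tricube
weights: for a point at distance d from the evaluation point, with maxDist
the largest distance within the window,

    w(d) = (1 - (d / maxDist)^^3)^^3    for d < maxDist, else 0.

The following standalone sketch (illustrative only; not used by this module)
shows the kernel in isolation:
---
double tricube(double d, double maxDist) {
    import std.algorithm : max;
    return max(0.0, (1 - (d / maxDist) ^^ 3) ^^ 3);
}
---
*/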

/**
This class is returned from the loess1D function and holds the state of a
loess regression with one predictor variable.
*/
final class Loess1D {
private:
    double[] _x;
    double[][] xPoly;
    double[] _y;
    double[][] yHat;
    double[][] residualWeights;

    int _degree;
    int nNeighbors;

    size_t nearestNeighborX(double point) const {
        auto sortedX = assumeSorted(x);
        auto trisected = sortedX.trisect(point);

        if(trisected[1].length) return trisected[0].length;
        if(!trisected[0].length) return 0;
        if(!trisected[2].length) return trisected[0].length - 1;

        if(point - trisected[0][trisected[0].length - 1] <
            trisected[2][0] - point
        ) {
            return trisected[0].length - 1;
        }

        return trisected[0].length;
    }

    static void computexTWx(
        const double[][] xPoly,
        const(double)[] weights,
        ref DoubleMatrix covMatrix
    ) {
        foreach(i; 0..xPoly.length) foreach(j; 0..i + 1) {
            covMatrix[i + 1, j + 1] = covMatrix[j + 1, i + 1] = threeDot(
                xPoly[i], weights, xPoly[j]
            );
        }

        // Handle intercept terms.
        foreach(i; 1..covMatrix.rows) {
            covMatrix[0, i] = covMatrix[i, 0] = dotProduct(weights, xPoly[i - 1]);
        }

        covMatrix[0, 0] = sum(weights);
    }

    static void computexTWy(
        const(double)[][] xPoly,
        const(double)[] y,
        const(double)[] weights,
        double[] ans
    ) {
        foreach(i; 0..xPoly.length) {
            ans[i + 1] = threeDot(xPoly[i], weights, y);
        }

        // Intercept:
        ans[0] = dotProduct(weights, y);
    }

public:
    /**
    Predict the value of y when x == point, using robustness iterations
    of the biweight procedure outlined in the reference to make the
    estimates more robust.

    Notes:

    This function is computationally intensive but may be called
    from multiple threads simultaneously. When predicting a
    large number of points, a parallel foreach loop may be used.

    Before calling this function with robustness > 0,
    computeRobustWeights() must be called. See that function for details.

    Returns: The predicted y value.
    */
    double predict(double point, int robustness = 0) const {
        dstatsEnforce(cast(int) residualWeights.length >= robustness,
            "For robust loess1D estimates, computeRobustWeights() " ~
            "must be called with the proper robustness level before " ~
            "calling predict()."
        );

        auto alloc = newRegionAllocator();
        auto covMatrix = doubleMatrix(degree + 1, degree + 1, alloc);
        auto xTy = alloc.uninitializedArray!(double[])(degree + 1);
        auto betas = alloc.uninitializedArray!(double[])(degree + 1);
        auto xPolyNeighbors = alloc.array(xPoly);

        size_t firstNeighbor = nearestNeighborX(point);
        size_t lastNeighbor = firstNeighbor;

        // Grow the window one point at a time, always toward the closer
        // unused neighbor, until it contains nNeighbors points.
        while(lastNeighbor - firstNeighbor + 1 < nNeighbors) {
            immutable upperDiff = (lastNeighbor + 1 >= x.length) ?
                double.infinity : (x[lastNeighbor + 1] - point);
            immutable lowerDiff = (firstNeighbor == 0) ?
                double.infinity : (point - x[firstNeighbor - 1]);

            if(upperDiff < lowerDiff) {
                lastNeighbor++;
            } else {
                firstNeighbor--;
            }
        }

        foreach(j, col; xPoly) {
            xPolyNeighbors[j] = xPoly[j][firstNeighbor..lastNeighbor + 1];
        }

        auto yNeighbors = y[firstNeighbor..lastNeighbor + 1];
        immutable maxDist = computeMaxDist(
            x[firstNeighbor..lastNeighbor + 1], point);

        // Tricube weights, scaled by the residual weights for robust fits.
        auto w = alloc.uninitializedArray!(double[])(yNeighbors.length);
        foreach(j, neighbor; x[firstNeighbor..lastNeighbor + 1]) {
            immutable diff = abs(point - neighbor);
            w[j] = max(0, (1 - (diff / maxDist) ^^ 3) ^^ 3);
            if(robustness > 0) {
                w[j] *= residualWeights[robustness - 1][j + firstNeighbor];
            }
        }

        computexTWx(xPolyNeighbors, w, covMatrix);
        computexTWy(xPolyNeighbors, w, yNeighbors, xTy);
        choleskySolve(covMatrix, xTy, betas);

        // Evaluate the local polynomial at the point of interest.
        double ret = 0;
        foreach(d; 0..degree + 1) {
            ret += betas[d] * point ^^ d;
        }

        return ret;
    }
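
    /*
    A usage sketch of parallel prediction (the variable names are
    hypothetical). Since predict() is const, evaluating many points can be
    spread across a TaskPool once computeRobustWeights() has been called
    for the desired robustness level:
    ---
    import std.parallelism;

    auto model = loess1D(y, x, 0.5);
    model.computeRobustWeights(1);  // Must precede robust predict() calls.

    auto results = new double[points.length];
    foreach(i, pt; taskPool.parallel(points)) {
        results[i] = model.predict(pt, 1);
    }
    ---
    */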

    /**
    Compute the weights for robust loess, for all robustness levels <= the
    robustness parameter. This computation is embarrassingly parallel, so if a
    TaskPool is provided it will be parallelized.

    This function must be called before calling predict() with a robustness
    value > 0. computeRobustWeights() must be called with a robustness level
    >= the robustness level predict() is to be called with. This is not
    handled implicitly because computeRobustWeights() is very computationally
    intensive and because it modifies the state of the Loess1D object, while
    predict() is const. Forcing computeRobustWeights() to be called explicitly
    allows multiple instances of predict() to be evaluated in parallel.
    */
    void computeRobustWeights(int robustness, TaskPool pool = null) {
        dstatsEnforce(robustness >= 0, "Robustness cannot be <0.");

        if(cast(int) yHat.length < robustness) {
            computeRobustWeights(robustness - 1, pool);
        }

        if(yHat.length >= robustness + 1) {
            return;
        }

        if(robustness > 0) {
            auto resid = new double[y.length];
            residualWeights ~= resid;
            resid[] = y[] - yHat[$ - 1][];

            foreach(ref r; resid) r = abs(r);
            immutable sixMed = 6 * median(resid);

            foreach(ref r; resid) {
                r = biweight(r / sixMed);
            }
        }

        yHat ~= new double[y.length];

        if(pool is null) {
            foreach(i, point; x) {
                yHat[robustness][i] = predict(point, robustness);
            }
        } else {
            foreach(i, point; pool.parallel(x, 1)) {
                yHat[robustness][i] = predict(point, robustness);
            }
        }
    }
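
    /*
    The residual weights computed above follow Cleveland (1979): with
    s = median(|residual|), each observation receives the weight
    B(residual / (6s)), where B is the biweight kernel

        B(u) = (1 - u^^2)^^2    for |u| < 1, else 0

    (see biweight() at module scope below).
    */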

    /**
    Obtain smoothed predictions of y at the values of x provided on creation
    of this object, for the given level of robustness. Evaluating
    these is computationally expensive and may be parallelized by
    providing a TaskPool object.
    */
    const(double)[] predictions(int robustness, TaskPool pool = null) {
        dstatsEnforce(robustness >= 0, "Robustness cannot be <0.");
        computeRobustWeights(robustness, pool);
        return yHat[robustness];
    }

const pure nothrow @property @safe:

    /// The polynomial degree for the local regression.
    int degree() {
        return _degree;
    }

    /// The x values provided on object creation.
    const(double)[] x() {
        return _x;
    }

    /// The y values provided on object creation.
    const(double)[] y() {
        return _y;
    }
}

private:
double biweight(double x) pure nothrow @safe {
    if(abs(x) >= 1) return 0;
    return (1 - x ^^ 2) ^^ 2;
}

double computeMaxDist(const double[] stuff, double x) pure nothrow @safe {
    double ret = 0;
    foreach(elem; stuff) {
        ret = max(ret, abs(elem - x));
    }

    return ret;
}

double absMax(double a, double b) {
    return max(abs(a), abs(b));
}

LogisticRes logisticRegressImpl(T, V...)
(bool inference, double ridge, T yIn, V input) {
    auto alloc = newRegionAllocator();

    static if(isFloatingPoint!(V[$ - 1])) {
        alias input[$ - 1] conf;
        alias V[0..$ - 1] U;
        alias input[0..$ - 1] xIn;
        enforceConfidence(conf);
    } else {
        alias V U;
        alias input xIn;
        enum conf = 0.95;
    }

    static assert(!isInfinite!T, "Can't do regression with infinite # of Y's.");
    static if(isRandomAccessRange!T) {
        alias yIn y;
    } else {
        auto y = toBools(yIn, alloc);
    }

    static if(U.length == 1 && isRoR!U) {
        static if(isForwardRange!U) {
            auto x = toRandomAccessRoR(y.length, xIn, alloc);
        } else {
            auto x = toRandomAccessRoR(y.length, alloc.array(xIn), alloc);
        }
    } else {
        auto xx = toRandomAccessTuple(xIn, alloc);
        auto x = xx.expand;
    }

    typeof(return) ret;
    ret.betas.length = x.length;
    if(inference) ret.stdErr.length = x.length;
    ret.logLikelihood = doMLENewton(ret.betas, ret.stdErr, ridge, y, x);

    if(!inference) return ret;

    static bool hasNaNs(R)(R range) {
        return !filter!isNaN(range).empty;
    }

    if(isNaN(ret.logLikelihood) || hasNaNs(ret.betas) || hasNaNs(ret.stdErr)) {
        // Then we didn't converge or our data was defective.
        return ret;
    }

    ret.nullLogLikelihood = .priorLikelihood(y);
    double lratio = ret.logLikelihood - ret.nullLogLikelihood;

    // Compensate for numerical fuzz.
    if(lratio < 0 && lratio > -1e-5) lratio = 0;
    if(lratio >= 0 && x.length >= 2) {
        ret.overallP = chiSquareCDFR(2 * lratio, x.length - 1);
    }

    ret.p.length = x.length;
    ret.lowerBound.length = x.length;
    ret.upperBound.length = x.length;
    immutable nDev = -invNormalCDF((1 - conf) / 2);
    foreach(i; 0..x.length) {
        ret.p[i] = 2 * normalCDF(-abs(ret.betas[i]) / ret.stdErr[i]);
        ret.lowerBound[i] = ret.betas[i] - nDev * ret.stdErr[i];
        ret.upperBound[i] = ret.betas[i] + nDev * ret.stdErr[i];
    }

    return ret;
}
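
/*
The inference block above uses standard Wald statistics: for each
coefficient, p[i] = 2 * Phi(-|beta[i]| / stdErr[i]), where Phi is the
standard normal CDF, and the confidence interval is beta[i] +/- z * stdErr[i]
with z = -invNormalCDF((1 - conf) / 2). The overall p-value comes from the
likelihood ratio test: 2 * (logLikelihood - nullLogLikelihood) is compared
against a chi-square distribution with x.length - 1 degrees of freedom.
*/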

private void logisticRegressPenalizedImpl(Y, X...)
(double[] betas, double lasso, double ridge, Y y, X xIn) {
    static if(isRoR!(X[0]) && X.length == 1) {
        alias xIn[0] x;
    } else {
        alias xIn x;
    }

    auto alloc = newRegionAllocator();
    auto ps = alloc.uninitializedArray!(double[])(y.length);
    betas[] = 0;
    auto betasRaw = alloc.array(betas[1..$]);

    double oldLikelihood = -double.infinity;
    double oldPenalty2 = double.infinity;
    double oldPenalty1 = double.infinity;
    enum eps = 1e-6;
    enum maxIter = 1000;

    auto weights = alloc.uninitializedArray!(double[])(y.length);
    auto z = alloc.uninitializedArray!(double[])(y.length);
    auto xMeans = alloc.uninitializedArray!(double[])(x.length);
    auto xSds = alloc.uninitializedArray!(double[])(x.length);
    double zMean = 0;
    auto xCenterScale = alloc.uninitializedArray!(double[][])(x.length);
    foreach(ref col; xCenterScale) {
        col = alloc.uninitializedArray!(double[])(y.length);
    }

    // Puts x in xCenterScale, with the weighted mean subtracted and the
    // weighted biased stdev divided out. Also standardizes z similarly.
    //
    // Returns: true if successful, false if weightSum is so small that the
    // algorithm has converged for all practical purposes.
    bool doCenterScale() {
        immutable weightSum = sum(weights);
        if(weightSum < eps) return false;

        xMeans[] = 0;
        zMean = 0;
        xSds[] = 0;

        // Do copying.
        foreach(i, col; x) {
            copy(take(col, y.length), xCenterScale[i]);
        }

        // Compute weighted means.
        foreach(j, col; xCenterScale) {
            foreach(i, w; weights) {
                xMeans[j] += w * col[i];
            }
        }

        foreach(i, w; weights) {
            zMean += w * z[i];
        }

        xMeans[] /= weightSum;
        zMean /= weightSum;
        z[] -= zMean;
        foreach(i, col; xCenterScale) col[] -= xMeans[i];

        // Compute biased stdevs.
        foreach(j, ref sd; xSds) {
            sd = sqrt(meanStdev(xCenterScale[j]).mse);
        }

        foreach(i, col; xCenterScale) col[] /= xSds[i];
        return true;
    }

    // Rescales the beta coefficients to undo the effects of standardizing.
    void rescaleBetas() {
        betas[0] = zMean;
        foreach(i, b; betasRaw) {
            betas[i + 1] = b / xSds[i];
            betas[0] -= betas[i + 1] * xMeans[i];
        }
    }
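
    // The main loop below is iteratively reweighted least squares (IRLS):
    // at each iteration the logistic model is approximated by a weighted
    // linear regression with weights w_i = p_i * (1 - p_i) and working
    // response z_i = x_i' * betas + (y_i - p_i) / w_i, which is then fit
    // by coordinate descent (lasso > 0) or by the ridge normal equations.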

    foreach(iter; 0..maxIter) {
        evalPs(betas[0], ps, betas[1..$], x);

        // Log likelihood plus the L2 and L1 penalty terms.
        immutable lh = logLikelihood(ps, y);
        immutable penalty2 = ridge * reduce!"a + b * b"(0.0, betas);
        immutable penalty1 = lasso * reduce!"a + ((b < 0) ? -b : b)"(0.0, betas);

        if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
            && (oldPenalty2 - penalty2) / (absMax(penalty2, oldPenalty2) + 0.1) < eps
            && (oldPenalty1 - penalty1) / (absMax(penalty1, oldPenalty1) + 0.1) < eps) {
            return;
        } else if(isNaN(lh) || isNaN(penalty2) || isNaN(penalty1)) {
            betas[] = double.nan;
            return;
        }

        oldPenalty2 = penalty2;
        oldPenalty1 = penalty1;
        oldLikelihood = lh;

        foreach(i, ref w; weights) {
            w = ps[i] * (1 - ps[i]);
        }

        z[] = betas[0];
        foreach(i, col; x) {
            static if(is(typeof(col) : const(double)[])) {
                z[] += col[] * betas[i + 1];
            } else {
                foreach(j, ref elem; z) {
                    elem += col[j] * betas[i + 1];
                }
            }
        }

        foreach(i, w; weights) {
            if(w == 0) {
                z[i] = 0;
            } else {
                immutable double yi = (y[i] == 0) ? 0.0 : 1.0;
                z[i] += (yi - ps[i]) / w;
            }
        }

        immutable centerScaleRes = doCenterScale();

        // If this is false then weightSum is so small that all probabilities
        // are for all practical purposes either 0 or 1. We can declare
        // convergence and go home.
        if(!centerScaleRes) return;

        // Correct for different conventions in defining ridge params
        // so all functions get the same answer.
        immutable ridgeCorrected = ridge * 2.0;

        if(lasso > 0) {
            coordDescent(z, xCenterScale, betasRaw,
                lasso, ridgeCorrected, weights);
        } else {
            ridgeLargeP(z, xCenterScale, ridgeCorrected, betasRaw, weights);
        }

        rescaleBetas();
    }

    immutable lh = logLikelihood(ps, y);
    immutable penalty2 = ridge * reduce!"a + b * b"(0.0, betas);
    immutable penalty1 = lasso * reduce!"a + ((b < 0) ? -b : b)"(0.0, betas);

    if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
        && (oldPenalty2 - penalty2) / (absMax(penalty2, oldPenalty2) + 0.1) < eps
        && (oldPenalty1 - penalty1) / (absMax(penalty1, oldPenalty1) + 0.1) < eps) {
        return;
    } else {
        // If we got here, we haven't converged. Return NaNs instead of bogus
        // values.
        betas[] = double.nan;
    }
}

// Calculate the mean squared error of all ranges. This is delicate, though,
// because some may be infinite and we want to stop at the shortest range.
double[] calculateMSEs(U...)(U xIn, RegionAllocator alloc) {
    static if(isRoR!(U[0]) && U.length == 1) {
        alias xIn[0] x;
    } else {
        alias xIn x;
    }

    size_t minLen = size_t.max;
    foreach(r; x) {
        static if(!isInfinite!(typeof(r))) {
            static assert(hasLength!(typeof(r)),
                "Ranges passed to doMLENewton should be random access, meaning " ~
                "either infinite or with length.");

            minLen = min(minLen, r.length);
        }
    }

    dstatsEnforce(minLen < size_t.max,
        "Can't do logistic regression if all of the ranges are infinite.");

    auto ret = alloc.uninitializedArray!(double[])(x.length);
    foreach(ti, range; x) {
        // Workaround for bug https://issues.dlang.org/show_bug.cgi?id=13151.
        // Can be removed once there's no more need for 2.066.* support.
        static if(is(typeof(range.save) R == Take!T, T)) {
            auto saved = range.save;
            ret[ti] = meanStdev(R(saved.source, min(minLen, saved.maxLength))).mse;
        } else {
            ret[ti] = meanStdev(take(range.save, minLen)).mse;
        }
    }

    return ret;
}

double doMLENewton(T, U...)
(double[] beta, double[] stdError, double ridge, T y, U xIn) {
    // This big, disgusting function uses the Newton-Raphson method as outlined
    // in http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf
    //
    // The matrix operations are kind of obfuscated because they're written
    // using very low-level primitives and with as little temp space as
    // possible used.
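    //
    // In the unpenalized case, the update solved for below is the standard
    // Newton/Fisher scoring step:
    //
    //     beta <- beta + (X' V X)^-1 X' (y - p),   V = diag(p * (1 - p)),
    //
    // with 2 * ridge * mses[i] added to the i-th diagonal of X' V X, and
    // beta[i] * 2 * ridge * mses[i] subtracted from the gradient, when
    // ridge > 0.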

    static if(isRoR!(U[0]) && U.length == 1) {
        alias xIn[0] x;
    } else {
        alias xIn x;
    }

    auto alloc = newRegionAllocator();

    double[] mses;  // Used for ridge.
    if(ridge > 0) {
        mses = calculateMSEs(x, alloc);
    }

    beta[] = 0;
    if(stdError.length) stdError[] = double.nan;

    auto ps = alloc.uninitializedArray!(double[])(y.length);

    double getPenalty() {
        if(ridge == 0) return 0;

        double ret = 0;
        foreach(i, b; beta) {
            ret += ridge * mses[i] * b ^^ 2;
        }

        return ret;
    }

    enum eps = 1e-6;
    enum maxIter = 1000;

    auto oldLikelihood = -double.infinity;
    double oldPenalty = -double.infinity;
    auto firstDerivTerms = alloc.uninitializedArray!(double[])(beta.length);

    // matSaved saves mat for inverting to find std. errors, only if we
    // care about std. errors.
    auto mat = doubleMatrix(beta.length, beta.length, alloc);
    DoubleMatrix matSaved;

    if(stdError.length) {
        matSaved = doubleMatrix(beta.length, beta.length, alloc);
    }

    void saveMat() {
        foreach(i; 0..mat.rows) foreach(j; 0..mat.columns) {
            matSaved[i, j] = mat[i, j];
        }
    }

    void doStdErrs() {
        if(stdError.length) {
            // Here, we actually need to invert the information matrix.
            // We can use mat as scratch space since we don't need it
            // anymore.
            invert(matSaved, mat);

            foreach(i; 0..beta.length) {
                stdError[i] = sqrt(mat[i, i]);
            }
        }
    }

    auto updates = alloc.uninitializedArray!(double[])(beta.length);
    foreach(iter; 0..maxIter) {
        evalPs(ps, beta, x);
        immutable lh = logLikelihood(ps, y);
        immutable penalty = getPenalty();

        if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
            && (oldPenalty - penalty) / (absMax(penalty, oldPenalty) + 0.1) < eps) {
            doStdErrs();
            return lh;
        } else if(isNaN(lh)) {
            beta[] = double.nan;
            return lh;
        }

        oldLikelihood = lh;
        oldPenalty = penalty;

        foreach(i; 0..mat.rows) foreach(j; 0..mat.columns) {
            mat[i, j] = 0;
        }

        // Calculate X' * V * X in the notation of our reference. Since
        // V is a diagonal matrix of ps[] * (1.0 - ps[]), we only have one
        // dimension representing it. Do this for the lower half, then
        // symmetrize the matrix.
        foreach(i, xi; x) foreach(j, xj; x[0..i + 1]) {
            foreach(k; 0..ps.length) {
                mat[i, j] += (ps[k] * (1 - ps[k])) * xi[k] * xj[k];
            }
        }

        symmetrize(mat);

        if(stdError.length) {
            saveMat();
        }

        // Convert ps to ys - ps.
        foreach(pIndex, ref p; ps) {
            p = (y[pIndex] == 0) ? -p : (1 - p);
        }

        // Compute X'(y - p).
        foreach(ti, xRange; x) {
            // Workaround for bug https://issues.dlang.org/show_bug.cgi?id=13151.
            // Can be removed once there's no more need for 2.066.* support.
            static if(is(typeof(xRange) R == Take!T, T)) {
                firstDerivTerms[ti] = dotProduct(
                    R(xRange.source, min(ps.length, xRange.maxLength)),
                    ps);
            } else {
                firstDerivTerms[ti] = dotProduct(take(xRange, ps.length), ps);
            }
        }

        // Add ridge penalties, if any.
        if(ridge > 0) {
            foreach(diagIndex, mse; mses) {
                mat[diagIndex, diagIndex] += 2 * ridge * mse;
                firstDerivTerms[diagIndex] -= beta[diagIndex] * 2 * ridge * mse;
            }
        }

        choleskySolve(mat, firstDerivTerms, updates);
        beta[] += updates[];

        debug(print) writeln("Iter: ", iter);
    }

    immutable lh = logLikelihood(ps, y);
    immutable penalty = getPenalty();
    if((lh - oldLikelihood) / (absMax(lh, oldLikelihood) + 0.1) < eps
        && (oldPenalty - penalty) / (absMax(penalty, oldPenalty) + 0.1) < eps) {
        doStdErrs();
        return lh;
    } else {
        // If we got here, we haven't converged. Return NaNs instead of bogus
        // values.
        beta[] = double.nan;
        return double.nan;
    }
}

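// logLikelihood below computes the usual binomial log likelihood of the
// fitted probabilities ps given the observed labels y:
//
//     l = sum over i of [ y_i * log(p_i) + (1 - y_i) * log(1 - p_i) ].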
private double logLikelihood(Y)(double[] ps, Y y) {
    double sum = 0;
    size_t i = 0;
    foreach(yVal; y) {
        scope(exit) i++;
        if(yVal) {
            sum += log(ps[i]);
        } else {
            sum += log(1 - ps[i]);
        }
    }
    return sum;
}

void evalPs(X...)(double[] ps, double[] beta, X xIn) {
    evalPs(0, ps, beta, xIn);
}

void evalPs(X...)(double interceptTerm, double[] ps, double[] beta, X xIn) {
    static if(isRoR!(X[0]) && X.length == 1) {
        alias xIn[0] x;
    } else {
        alias xIn x;
    }

    assert(x.length == beta.length);
    ps[] = interceptTerm;

    foreach(i, range; x) {
        static if(hasLength!(typeof(range))) {
            assert(range.length == ps.length);
        }

        static if(is(typeof(range) == double[])) {
            // Take advantage of array ops.
            ps[] += range[0..ps.length] * beta[i];
        } else {
            immutable b = beta[i];

            size_t j = 0;
            foreach(elem; range) {
                if(j >= ps.length) break;
                ps[j++] += b * elem;
            }
        }
    }

    foreach(ref elem; ps) elem = logistic(elem);
}

template isRoR(T) {
    static if(!isInputRange!T) {
        enum isRoR = false;
    } else {
        enum isRoR = isInputRange!(typeof(T.init.front()));
    }
}

template isFloatMat(T) {
    static if(is(T : const(float[][])) ||
        is(T : const(real[][])) || is(T : const(double[][]))) {
        enum isFloatMat = true;
    } else {
        enum isFloatMat = false;
    }
}

template NonRandomToArray(T) {
    static if(isRandomAccessRange!T) {
        alias T NonRandomToArray;
    } else {
        alias Unqual!(ElementType!(T))[] NonRandomToArray;
    }
}

double priorLikelihood(Y)(Y y) {
    uint nTrue, n;
    foreach(elem; y) {
        n++;
        if(elem) nTrue++;
    }

    immutable p = cast(double) nTrue / n;
    return nTrue * log(p) + (n - nTrue) * log(1 - p);
}

bool[] toBools(R)(R range, RegionAllocator alloc) {
    return alloc.array(map!"(a) ? true : false"(range));
}

auto toRandomAccessRoR(T)(size_t len, T ror, RegionAllocator alloc) {
    static assert(isRoR!T);
    alias ElementType!T E;
    static if(isArray!T && isRandomAccessRange!E) {
        return ror;
    } else static if(!isArray!T && isRandomAccessRange!E) {
        // Shallow copy so we know it has cheap slicing and stuff,
        // even if it is random access.
        return alloc.array(ror);
    } else {
        alias ElementType!E EE;
        auto ret = alloc.uninitializedArray!(EE[][])(walkLength(ror.save));

        foreach(ref col; ret) {
            scope(exit) ror.popFront();
            col = alloc.uninitializedArray!(EE[])(len);

            size_t i;
            foreach(elem; ror.front) {
                col[i++] = elem;
            }
        }

        return ret;
    }
}

auto toRandomAccessTuple(T...)(T input, RegionAllocator alloc) {
    Tuple!(staticMap!(NonRandomToArray, T)) ret;

    foreach(ti, range; input) {
        static if(isRandomAccessRange!(typeof(range))) {
            ret.field[ti] = range;
        } else {
            ret.field[ti] = alloc.array(range);
        }
    }

    return ret;
}

// Borrowed and modified from Phobos's dotProduct() function,
// Copyright Andrei Alexandrescu 2008 - 2009. Originally licensed
// under the Boost Software License 1.0. (License text included
// at the beginning of this file.)
private double threeDot(
    const double[] x1,
    const double[] x2,
    const double[] x3
) in {
    assert(x1.length == x2.length);
    assert(x2.length == x3.length);
} body {
    immutable n = x1.length;
    auto avec = x1.ptr, bvec = x2.ptr, cvec = x3.ptr;
    typeof(return) sum0 = 0, sum1 = 0;

    const all_endp = avec + n;
    const smallblock_endp = avec + (n & ~3);
    const bigblock_endp = avec + (n & ~15);

    // Unrolled 16-at-a-time loop, accumulating into two partial sums to
    // shorten the dependency chain.
    for(; avec != bigblock_endp; avec += 16, bvec += 16, cvec += 16) {
        sum0 += avec[0] * bvec[0] * cvec[0];
        sum1 += avec[1] * bvec[1] * cvec[1];
        sum0 += avec[2] * bvec[2] * cvec[2];
        sum1 += avec[3] * bvec[3] * cvec[3];
        sum0 += avec[4] * bvec[4] * cvec[4];
        sum1 += avec[5] * bvec[5] * cvec[5];
        sum0 += avec[6] * bvec[6] * cvec[6];
        sum1 += avec[7] * bvec[7] * cvec[7];
        sum0 += avec[8] * bvec[8] * cvec[8];
        sum1 += avec[9] * bvec[9] * cvec[9];
        sum0 += avec[10] * bvec[10] * cvec[10];
        sum1 += avec[11] * bvec[11] * cvec[11];
        sum0 += avec[12] * bvec[12] * cvec[12];
        sum1 += avec[13] * bvec[13] * cvec[13];
        sum0 += avec[14] * bvec[14] * cvec[14];
        sum1 += avec[15] * bvec[15] * cvec[15];
    }

    for(; avec != smallblock_endp; avec += 4, bvec += 4, cvec += 4) {
        sum0 += avec[0] * bvec[0] * cvec[0];
        sum1 += avec[1] * bvec[1] * cvec[1];
        sum0 += avec[2] * bvec[2] * cvec[2];
        sum1 += avec[3] * bvec[3] * cvec[3];
    }

    sum0 += sum1;

    // Do the trailing portion in a naive loop.
    while(avec != all_endp) {
        sum0 += *avec * *bvec * *cvec;
        ++avec;
        ++bvec;
        ++cvec;
    }

    return sum0;
}

unittest {
    auto a = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
              12, 13, 14, 15, 16, 17, 18, 19, 20, 21];
    auto b = a.dup;
    b[] *= 2;
    auto c = b.dup;
    c[] *= 3;
    immutable ans1 = threeDot(a, b, c);

    double ans2 = 0;
    foreach(i; 0..21) {
        ans2 += a[i] * b[i] * c[i];
    }

    assert(approxEqual(ans1, ans2));
}