1 /**Pearson, Spearman and Kendall correlations, covariance. 2 * 3 * Author: David Simcha*/ 4 /* 5 * License: 6 * Boost Software License - Version 1.0 - August 17th, 2003 7 * 8 * Permission is hereby granted, free of charge, to any person or organization 9 * obtaining a copy of the software and accompanying documentation covered by 10 * this license (the "Software") to use, reproduce, display, distribute, 11 * execute, and transmit the Software, and to prepare derivative works of the 12 * Software, and to permit third-parties to whom the Software is furnished to 13 * do so, all subject to the following: 14 * 15 * The copyright notices in the Software and this entire statement, including 16 * the above license grant, this restriction and the following disclaimer, 17 * must be included in all copies of the Software, in whole or in part, and 18 * all derivative works of the Software, unless such copies or derivative 19 * works are solely in the form of machine-executable object code generated by 20 * a source language processor. 21 * 22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 25 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 26 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 27 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 28 * DEALINGS IN THE SOFTWARE. 
29 */ 30 31 module dstats.cor; 32 33 import std.conv, std.range, std.typecons, std.exception, std.math, 34 std.traits, std.typetuple, std.algorithm, std.parallelism, std.numeric; 35 36 import dstats.sort, dstats.base, dstats.alloc, dstats.summary; 37 38 version(unittest) { 39 import std.stdio, dstats.random; 40 41 Random gen; 42 43 shared static this() 44 { 45 gen.seed(unpredictableSeed); 46 } 47 } 48 49 /**Convenience function for calculating Pearson correlation. 50 * When the term correlation is used unqualified, it is 51 * usually referring to this quantity. This is a parametric correlation 52 * metric and should not be used with extremely ill-behaved data. 53 * This function works with any pair of input ranges. 54 * 55 * Note: The PearsonCor struct returned by this function is alias this'd to the 56 * correlation coefficient. Therefore, the result from this function can 57 * be treated simply as a floating point number. 58 * 59 * References: Computing Higher-Order Moments Online. 60 * http://people.xiph.org/~tterribe/notes/homs.html 61 */ 62 PearsonCor pearsonCor(T, U)(T input1, U input2) 63 if(doubleInput!(T) && doubleInput!(U)) { 64 PearsonCor corCalc; 65 66 static if(isRandomAccessRange!T && isRandomAccessRange!U && 67 hasLength!T && hasLength!U) { 68 69 // ILP parallelization optimization. Sharing a k between a bunch 70 // of implicit PearsonCor structs cuts down on the amount of divisions 71 // necessary. Using nILP of them instead of one improves CPU pipeline 72 // performance by reducing data dependency. When the stack is 73 // properly aligned, this can result in about 2x speedups compared 74 // to simply submitting everything to a single PearsonCor struct. 
75 dstatsEnforce(input1.length == input2.length, 76 "Ranges must be same length for Pearson correlation."); 77 78 enum nILP = 8; 79 size_t i = 0; 80 if(input1.length > 2 * nILP) { 81 82 double _k = 0; 83 double[nILP] _mean1 = 0, _mean2 = 0, _var1 = 0, _var2 = 0, _cov = 0; 84 85 for(; i + nILP < input1.length; i += nILP) { 86 immutable kMinus1 = _k; 87 immutable kNeg1 = 1 / ++_k; 88 89 foreach(j; StaticIota!nILP) { 90 immutable double delta1 = input1[i + j] - _mean1[j]; 91 immutable double delta2 = input2[i + j] - _mean2[j]; 92 immutable delta1N = delta1 * kNeg1; 93 immutable delta2N = delta2 * kNeg1; 94 95 _mean1[j] += delta1N; 96 _var1[j] += kMinus1 * delta1N * delta1; 97 _cov[j] += kMinus1 * delta1N * delta2; 98 _var2[j] += kMinus1 * delta2N * delta2; 99 _mean2[j] += delta2N; 100 } 101 } 102 103 corCalc._k = _k; 104 corCalc._mean1 = _mean1[0]; 105 corCalc._mean2 = _mean2[0]; 106 corCalc._var1 = _var1[0]; 107 corCalc._var2 = _var2[0]; 108 corCalc._cov = _cov[0]; 109 110 foreach(j; 1..nILP) { 111 auto tmp = PearsonCor(_k, _mean1[j], _mean2[j], 112 _var1[j], _var2[j], _cov[j]); 113 corCalc.put(tmp); 114 } 115 } 116 117 // Handle remainder. 118 for(; i < input1.length; i++) { 119 corCalc.put(input1[i], input2[i]); 120 } 121 122 } else { 123 while(!input1.empty && !input2.empty) { 124 corCalc.put(input1.front, input2.front); 125 input1.popFront; 126 input2.popFront; 127 } 128 129 dstatsEnforce(input1.empty && input2.empty, 130 "Ranges must be same length for Pearson correlation."); 131 } 132 133 return corCalc; 134 } 135 136 unittest { 137 assert(approxEqual(pearsonCor([1,2,3,4,5][], [1,2,3,4,5][]).cor, 1)); 138 assert(approxEqual(pearsonCor([1,2,3,4,5][], [10.0, 8.0, 6.0, 4.0, 2.0][]).cor, -1)); 139 assert(approxEqual(pearsonCor([2, 4, 1, 6, 19][], [4, 5, 1, 3, 2][]).cor, -.2382314)); 140 141 // Make sure everything works with lowest common denominator range type. 
142 static struct Count { 143 uint num; 144 uint upTo; 145 @property size_t front() { 146 return num; 147 } 148 void popFront() { 149 num++; 150 } 151 @property bool empty() { 152 return num >= upTo; 153 } 154 } 155 156 Count a, b; 157 a.upTo = 100; 158 b.upTo = 100; 159 assert(approxEqual(pearsonCor(a, b).cor, 1)); 160 161 PearsonCor cor1 = pearsonCor([1,2,4][], [2,3,5][]); 162 PearsonCor cor2 = pearsonCor([4,2,9][], [2,8,7][]); 163 PearsonCor combined = pearsonCor([1,2,4,4,2,9][], [2,3,5,2,8,7][]); 164 165 cor1.put(cor2); 166 167 foreach(ti, elem; cor1.tupleof) { 168 assert(approxEqual(elem, combined.tupleof[ti])); 169 } 170 171 assert(approxEqual(pearsonCor([1,2,3,4,5,6,7,8,9,10][], 172 [8,6,7,5,3,0,9,3,6,2][]).cor, -0.4190758)); 173 174 foreach(iter; 0..1000) { 175 // Make sure results for the ILP-optimized and non-optimized versions 176 // agree. 177 auto foo = randArray!(rNormal)(uniform(5, 100), 0, 1); 178 auto bar = randArray!(rNormal)(foo.length, 0, 1); 179 auto res1 = pearsonCor(foo, bar); 180 PearsonCor res2; 181 foreach(i; 0..foo.length) { 182 res2.put(foo[i], bar[i]); 183 } 184 185 foreach(ti, elem; res1.tupleof) { 186 assert(approxEqual(elem, res2.tupleof[ti])); 187 } 188 189 PearsonCor resCornerCase; // Test where one N is zero. 190 resCornerCase.put(res1); 191 PearsonCor dummy; 192 resCornerCase.put(dummy); 193 foreach(ti, elem; res1.tupleof) { 194 assert(isIdentical(resCornerCase.tupleof[ti], elem)); 195 } 196 } 197 } 198 199 /**Allows computation of mean, stdev, variance, covariance, Pearson correlation online. 200 * Getters for stdev, var, cov, cor cost floating point division ops. Getters 201 * for means cost a single branch to check for N == 0. This struct uses O(1) 202 * space. 203 * 204 * PearsonCor.cor is alias this'd, so if this struct is used as a float, it will 205 * be converted to a simple correlation coefficient automatically. 206 * 207 * Bugs: Alias this disabled due to compiler bugs. 
 *
 * References: Computing Higher-Order Moments Online.
 * http://people.xiph.org/~tterribe/notes/homs.html
 */
struct PearsonCor {
package:
    // _k is the observation count.  _var1, _var2 and _cov hold raw *sums*
    // of squared/cross deviations from the running means (not yet divided
    // by N - 1); the public getters perform that division on demand.
    double _k = 0, _mean1 = 0, _mean2 = 0, _var1 = 0, _var2 = 0, _cov = 0;

public:
    alias cor this;

    /// Add a single paired observation using a Welford-style online update.
    void put(double elem1, double elem2) nothrow @safe {
        immutable kMinus1 = _k;
        immutable kNeg1 = 1 / ++_k;
        immutable delta1 = elem1 - _mean1;
        immutable delta2 = elem2 - _mean2;
        immutable delta1N = delta1 * kNeg1;
        immutable delta2N = delta2 * kNeg1;

        // Order matters here: _cov uses delta1N (the mean-1 update step)
        // against the *un-updated* delta2, and _mean2 must be updated last.
        _mean1 += delta1N;
        _var1 += kMinus1 * delta1N * delta1;
        _cov += kMinus1 * delta1N * delta2;
        _var2 += kMinus1 * delta2N * delta2;
        _mean2 += delta2N;
    }

    /// Combine two PearsonCor's.
    void put(const ref typeof(this) rhs) nothrow @safe {
        // Corner cases: combining with an empty accumulator on either side
        // just yields the other side's state unchanged.
        if(_k == 0) {
            foreach(ti, elem; rhs.tupleof) {
                this.tupleof[ti] = elem;
            }
            return;
        } else if(rhs._k == 0) {
            return;
        }

        immutable totalN = _k + rhs._k;
        immutable delta1 = rhs.mean1 - mean1;
        immutable delta2 = rhs.mean2 - mean2;

        // Weighted means; dividing each weight by totalN before multiplying
        // keeps intermediate magnitudes small.
        _mean1 = _mean1 * (_k / totalN) + rhs._mean1 * (rhs._k / totalN);
        _mean2 = _mean2 * (_k / totalN) + rhs._mean2 * (rhs._k / totalN);

        // Pairwise combination of the deviation sums (see References above).
        _var1 = _var1 + rhs._var1 + (_k / totalN * rhs._k * delta1 * delta1 );
        _var2 = _var2 + rhs._var2 + (_k / totalN * rhs._k * delta2 * delta2 );
        _cov = _cov + rhs._cov + (_k / totalN * rhs._k * delta1 * delta2 );
        _k = totalN;
    }

    const pure nothrow @property @safe {

        /// Sample variance of the first variable; NaN if fewer than 2 points.
        double var1() {
            return (_k < 2) ? double.nan : _var1 / (_k - 1);
        }

        /// Sample variance of the second variable; NaN if fewer than 2 points.
        double var2() {
            return (_k < 2) ? double.nan : _var2 / (_k - 1);
        }

        /// Sample standard deviation of the first variable.
        double stdev1() {
            return sqrt(var1);
        }

        /// Sample standard deviation of the second variable.
        double stdev2() {
            return sqrt(var2);
        }

        /// Pearson correlation coefficient; NaN if fewer than 2 points.
        double cor() {
            return cov / stdev1 / stdev2;
        }

        /// Sample covariance; NaN if fewer than 2 points.
        double cov() {
            return (_k < 2) ? double.nan : _cov / (_k - 1);
        }

        /// Mean of the first variable; NaN if no data has been accumulated.
        double mean1() {
            return (_k == 0) ? double.nan : _mean1;
        }

        /// Mean of the second variable; NaN if no data has been accumulated.
        double mean2() {
            return (_k == 0) ? double.nan : _mean2;
        }

        /// Number of observations accumulated so far.
        double N() {
            return _k;
        }

    }
}

/// Convenience function: the sample covariance of two ranges.
double covariance(T, U)(T input1, U input2)
if(doubleInput!(T) && doubleInput!(U)) {
    return pearsonCor(input1, input2).cov;
}

unittest {
    assert(approxEqual(covariance([1,4,2,6,3].dup, [3,1,2,6,2].dup), 2.05));
}

/**Spearman's rank correlation. Non-parametric. This is essentially the
 * Pearson correlation of the ranks of the data, with ties dealt with by
 * averaging.*/
double spearmanCor(R, S)(R input1, S input2)
if(isInputRange!(R) && isInputRange!(S) &&
   is(typeof(input1.front < input1.front) == bool) &&
   is(typeof(input2.front < input2.front) == bool)) {

    // Fewer than two observations: correlation is undefined.
    static if(hasLength!S && hasLength!R) {
        if(input1.length < 2) {
            return double.nan;
        }
    }

    auto alloc = newRegionAllocator();

    // Compute the average-rank transform of a range.  Random-access ranges
    // can be ranked directly; plain input ranges must be copied first
    // because ranking requires sorting.
    double[] spearmanCorRank(T)(T someRange) {
        static if(hasLength!(T) && isRandomAccessRange!(T)) {
            double[] ret = alloc.uninitializedArray!(double[])(someRange.length);
            rank(someRange, ret);
        } else {
            auto iDup = alloc.array(someRange);
            double[] ret = alloc.uninitializedArray!(double[])(iDup.length);
            rankSort(iDup, ret);
        }
        return ret;
    }

    try {
        auto ranks1 = spearmanCorRank(input1);
        auto ranks2 = spearmanCorRank(input2);
        dstatsEnforce(ranks1.length == ranks2.length,
            "Ranges must be same length for Spearman correlation.");

        return pearsonCor(ranks1, ranks2).cor;
    } catch(SortException) {
        // Unorderable data (e.g. NaNs): correlation is undefined.
        return double.nan;
    }
}

unittest {
    //Test against a few known values.
361 assert(approxEqual(spearmanCor([1,2,3,4,5,6].dup, [3,1,2,5,4,6].dup), 0.77143)); 362 assert(approxEqual(spearmanCor([3,1,2,5,4,6].dup, [1,2,3,4,5,6].dup ), 0.77143)); 363 assert(approxEqual(spearmanCor([3,6,7,35,75].dup, [1,63,53,67,3].dup), 0.3)); 364 assert(approxEqual(spearmanCor([1,63,53,67,3].dup, [3,6,7,35,75].dup), 0.3)); 365 assert(approxEqual(spearmanCor([1.5,6.3,7.8,4.2,1.5].dup, [1,63,53,67,3].dup), .56429)); 366 assert(approxEqual(spearmanCor([1,63,53,67,3].dup, [1.5,6.3,7.8,4.2,1.5].dup), .56429)); 367 assert(approxEqual(spearmanCor([1.5,6.3,7.8,7.8,1.5].dup, [1,63,53,67,3].dup), .79057)); 368 assert(approxEqual(spearmanCor([1,63,53,67,3].dup, [1.5,6.3,7.8,7.8,1.5].dup), .79057)); 369 assert(approxEqual(spearmanCor([1.5,6.3,7.8,6.3,1.5].dup, [1,63,53,67,3].dup), .63246)); 370 assert(approxEqual(spearmanCor([1,63,53,67,3].dup, [1.5,6.3,7.8,6.3,1.5].dup), .63246)); 371 assert(approxEqual(spearmanCor([3,4,1,5,2,1,6,4].dup, [1,3,2,6,4,2,6,7].dup), .6829268)); 372 assert(approxEqual(spearmanCor([1,3,2,6,4,2,6,7].dup, [3,4,1,5,2,1,6,4].dup), .6829268)); 373 uint[] one = new uint[1000], two = new uint[1000]; 374 foreach(i; 0..100) { //Further sanity checks for things like commutativity. 375 size_t lowerBound = uniform(0, one.length); 376 size_t upperBound = uniform(0, one.length); 377 if(lowerBound > upperBound) swap(lowerBound, upperBound); 378 foreach(ref o; one) { 379 o = uniform(1, 10); //Generate lots of ties. 380 } 381 foreach(ref o; two) { 382 o = uniform(1, 10); //Generate lots of ties. 
383 } 384 double sOne = 385 spearmanCor(one[lowerBound..upperBound], two[lowerBound..upperBound]); 386 double sTwo = 387 spearmanCor(two[lowerBound..upperBound], one[lowerBound..upperBound]); 388 foreach(ref o; one) 389 o*=-1; 390 double sThree = 391 -spearmanCor(one[lowerBound..upperBound], two[lowerBound..upperBound]); 392 double sFour = 393 -spearmanCor(two[lowerBound..upperBound], one[lowerBound..upperBound]); 394 foreach(ref o; two) o*=-1; 395 one[lowerBound..upperBound].reverse(); 396 two[lowerBound..upperBound].reverse(); 397 double sFive = 398 spearmanCor(one[lowerBound..upperBound], two[lowerBound..upperBound]); 399 assert(approxEqual(sOne, sTwo) || (isNaN(sOne) && isNaN(sTwo))); 400 assert(approxEqual(sTwo, sThree) || (isNaN(sThree) && isNaN(sTwo))); 401 assert(approxEqual(sThree, sFour) || (isNaN(sThree) && isNaN(sFour))); 402 assert(approxEqual(sFour, sFive) || (isNaN(sFour) && isNaN(sFive))); 403 } 404 405 // Test input ranges. 406 static struct Count { 407 uint num; 408 uint upTo; 409 @property size_t front() { 410 return num; 411 } 412 void popFront() { 413 num++; 414 } 415 @property bool empty() { 416 return num >= upTo; 417 } 418 } 419 420 Count a, b; 421 a.upTo = 100; 422 b.upTo = 100; 423 assert(approxEqual(spearmanCor(a, b), 1)); 424 } 425 426 version(unittest) { 427 // Make sure when we call kendallCor, the large N version always executes. 
    private enum kendallSmallN = 1;
} else {
    // Below roughly this length the O(N^2) routine beats the
    // O(N log N) one because of the latter's additive constants.
    private enum kendallSmallN = 15;
}

// True iff R is a SortedRange with the default "a < b" predicate, i.e.
// a range this module may assume is already sorted and skip re-sorting.
package template isDefaultSorted(R) {
    static if(!is(typeof(R.init.release()))) {
        enum isDefaultSorted = false;
    } else {
        enum isDefaultSorted =
            is(R == SortedRange!(typeof(R.init.release()), "a < b"));
    }
}

unittest {
    import std.algorithm;
    auto foo = sort([1, 2, 3]);
    auto bar = sort!"a > b"([1, 2, 3]);
    static assert(isDefaultSorted!(typeof(foo)));
    static assert(!isDefaultSorted!(typeof(bar)));
    static assert(!isDefaultSorted!(typeof([1, 2, 3])));
}

/**
Kendall's Tau-b, O(N log N) version. This is a non-parametric measure
of monotonic association and can be defined in terms of the
bubble sort distance, or the number of swaps that would be needed in a
bubble sort to sort input2 into the same order as input1.

Since a copy of the inputs is made anyhow because they need to be sorted,
this function can work with any input range. However, the ranges must
have the same length.

Note:

As an optimization, when a range is a SortedRange with predicate "a < b",
it is assumed already sorted and not sorted a second time by this function.
This is useful when applying this function multiple times with one of the
arguments the same every time:

---
auto lhs = randArray!rNormal(1_000, 0, 1);
auto indices = new size_t[1_000];
makeIndex(lhs, indices);

foreach(i; 0..1_000) {
    auto rhs = randArray!rNormal(1_000, 0, 1);
    auto lhsSorted = assumeSorted(
        indexed(lhs, indices)
    );

    // Rearrange rhs according to the sorting permutation of lhs.
    // kendallCor(lhsSorted, rhsRearranged) will be much faster than
    // kendallCor(lhs, rhs).
    auto rhsRearranged = indexed(rhs, indices);
    assert(kendallCor(lhsSorted, rhsRearranged) == kendallCor(lhs, rhs));
}
---

References:
A Computer Method for Calculating Kendall's Tau with Ungrouped Data,
William R. Knight, Journal of the American Statistical Association, Vol.
61, No. 314, Part 1 (Jun., 1966), pp. 436-439

The Variance of Tau When Both Rankings Contain Ties. M.G. Kendall.
Biometrika, Vol 34, No. 3/4 (Dec., 1947), pp. 297-298
*/
double kendallCor(T, U)(T input1, U input2)
if(isInputRange!(T) && isInputRange!(U)) {
    // Fast path: for arrays the lengths are known up front, so tiny
    // inputs can go straight to the O(N^2) routine without copying.
    static if(isArray!(T) && isArray!(U)) {
        dstatsEnforce(input1.length == input2.length,
            "Ranges must be same length for Kendall correlation.");
        if(input1.length <= kendallSmallN) {
            return kendallCorSmallN(input1, input2);
        }
    }

    auto alloc = newRegionAllocator();

    // Copy into region-allocated storage (unless already default-sorted)
    // so the destructive O(N log N) implementation may sort in place.
    auto prepare(V)(V range) {
        static if(isDefaultSorted!V) {
            return range;
        } else {
            return prepareForSorting!compFun(alloc.array(range));
        }
    }

    try {
        auto i1d = prepare(input1);
        auto i2d = prepare(input2);

        dstatsEnforce(i1d.length == i2d.length,
            "Ranges must be same length for Kendall correlation.");

        if(i1d.length <= kendallSmallN) {
            return kendallCorSmallN(i1d, i2d);
        } else {
            return kendallCorDestructive(i1d, i2d);
        }
    } catch(SortException) {
        // Unorderable data (e.g. NaNs): correlation is undefined.
        return double.nan;
    }
}

/**
Kendall's Tau-b O(N log N), overwrites input arrays with undefined data but
uses only O(log N) stack space for sorting, not O(N) space to duplicate
input. R1 and R2 must be either SortedRange structs with the default predicate
or arrays.
537 */ 538 double kendallCorDestructive(R1, R2)(R1 input1, R2 input2) { 539 dstatsEnforce(input1.length == input2.length, 540 "Ranges must be same length for Kendall correlation."); 541 try { 542 return kendallCorDestructiveLowLevel(input1, input2, false).tau; 543 } catch(SortException) { 544 return double.nan; 545 } 546 } 547 548 //bool compFun(T)(T lhs, T rhs) { return lhs < rhs; } 549 private enum compFun = "a < b"; 550 551 // Make sure arguments are in the right order, etc. to simplify implementation 552 // an dallow buffer recycling. 553 auto kendallCorDestructiveLowLevel(R1, R2) 554 (R1 input1, R2 input2, bool needTies) { 555 static if(isDefaultSorted!R1) { 556 return kendallCorDestructiveLowLevelImpl(input1, input2, needTies); 557 } else static if(isDefaultSorted!R2) { 558 return kendallCorDestructiveLowLevelImpl(input2, input1, needTies); 559 } else { 560 static assert(isArray!R1); 561 static assert(isArray!R2); 562 alias typeof(R1.init[0]) T; 563 alias typeof(R2.init[0]) U; 564 565 static if(T.sizeof > U.sizeof) { 566 return kendallCorDestructiveLowLevelImpl(input1, input2, needTies); 567 } else { 568 return kendallCorDestructiveLowLevelImpl(input2, input1, needTies); 569 } 570 } 571 } 572 573 struct KendallLowLevel { 574 double tau; 575 long s; 576 577 // Notation as in Kendall, 1947, Biometrika 578 579 ulong tieCorrectT1; // sum{t(t - 1)(2t + 5)} 580 ulong tieCorrectT2; // sum{t(t - 1)(t - 2)} 581 ulong tieCorrectT3; // sum{t(t - 1)} 582 583 ulong tieCorrectU1; // sum{u(u - 1)(2u + 5)} 584 ulong tieCorrectU2; // sum{u(u - 1)(u - 2)} 585 ulong tieCorrectU3; // sum{u(u - 1)} 586 } 587 588 package KendallLowLevel kendallCorDestructiveLowLevelImpl 589 (R1, R2)(R1 input1, R2 input2, bool needTies) 590 in { 591 assert(input1.length == input2.length); 592 } body { 593 static ulong getMs(V)(V data) { //Assumes data is sorted. 
594 ulong Ms = 0, tieCount = 0; 595 foreach(i; 1..data.length) { 596 if(data[i] == data[i - 1]) { 597 tieCount++; 598 } else if(tieCount) { 599 Ms += (tieCount * (tieCount + 1)) / 2; 600 tieCount = 0; 601 } 602 } 603 if(tieCount) { 604 Ms += (tieCount * (tieCount + 1)) / 2; 605 } 606 return Ms; 607 } 608 609 void computeTies(V) 610 (V arr, ref ulong tie1, ref ulong tie2, ref ulong tie3) { 611 if(!needTies) { 612 return; // If only computing correlation, this is a waste of time. 613 } 614 615 ulong tieCount = 1; 616 foreach(i; 1..arr.length) { 617 if(arr[i] == arr[i - 1]) { 618 tieCount++; 619 } else if(tieCount > 1) { 620 tie1 += tieCount * (tieCount - 1) * (2 * tieCount + 5); 621 tie2 += tieCount * (tieCount - 1) * (tieCount - 2); 622 tie3 += tieCount * (tieCount - 1); 623 tieCount = 1; 624 } 625 } 626 627 // Handle last run. 628 if(tieCount > 1) { 629 tie1 += tieCount * (tieCount - 1) * (2 * tieCount + 5); 630 tie2 += tieCount * (tieCount - 1) * (tieCount - 2); 631 tie3 += tieCount * (tieCount - 1); 632 } 633 } 634 635 ulong m1 = 0; 636 ulong nPair = (cast(ulong) input1.length * 637 ( cast(ulong) input1.length - 1UL)) / 2UL; 638 KendallLowLevel ret; 639 ret.s = to!long(nPair); 640 641 static if(!isDefaultSorted!R1) { 642 qsort!(compFun)(input1, input2); 643 } 644 645 uint tieCount = 0; 646 foreach(i; 1..input1.length) { 647 if(input1[i] == input1[i - 1]) { 648 tieCount++; 649 } else if(tieCount > 0) { 650 static if(!isDefaultSorted!R2) { 651 qsort!(compFun)(input2[i - tieCount - 1..i]); 652 } 653 m1 += tieCount * (tieCount + 1UL) / 2UL; 654 ret.s += getMs(input2[i - tieCount - 1..i]); 655 tieCount = 0; 656 } 657 } 658 if(tieCount > 0) { 659 static if(!isDefaultSorted!R2) { 660 qsort!(compFun)(input2[input1.length - tieCount - 1..input1.length]); 661 } 662 m1 += tieCount * (tieCount + 1UL) / 2UL; 663 ret.s += getMs(input2[input1.length - tieCount - 1..input1.length]); 664 } 665 666 computeTies(input1, ret.tieCorrectT1, ret.tieCorrectT2, ret.tieCorrectT3); 667 668 
    static if(isArray!R2) {
        ulong swapCount = 0;
        static if(isArray!R1) {
            // We've already guaranteed that input1 is the bigger array by
            // bytes and we own these arrays and won't use input1 again, so
            // this is safe.
            alias typeof(input2[0]) U;
            U[] input1Temp = (cast(U*) input1.ptr)[0..input2.length];
            mergeSortTemp!(compFun)(input2, input1Temp, &swapCount);
        } else {
            mergeSort!(compFun)(input2, &swapCount);
        }
    } else {
        enum ulong swapCount = 0; // If they're both sorted then tau == 1.
    }

    immutable m2 = getMs(input2);
    computeTies(input2, ret.tieCorrectU1, ret.tieCorrectU2, ret.tieCorrectU3);

    // Knight's method: the merge sort above counted the inversions
    // (discordant pairs) of input2 relative to input1's order; subtract
    // ties in each variable and twice the swap count from the pair total.
    ret.s -= (m1 + m2) + 2 * swapCount;
    immutable double denominator1 = nPair - m1;
    immutable double denominator2 = nPair - m2;
    // Tau-b: normalize s by the geometric mean of the tie-corrected pair
    // counts.  Dividing by each sqrt separately avoids overflow in the
    // product of the two denominators.
    ret.tau = ret.s / sqrt(denominator1) / sqrt(denominator2);
    return ret;
}

/* Kendall's Tau correlation, O(N^2) version. This is faster than the
 * more asymptotically efficient version for N <= about 15, and is also useful
 * for testing. Yes, the sorts for the large N impl fall back on insertion
 * sorting for moderately small N, but due to additive constants and O(N) terms
 * this algorithm is still faster for very small N. (Besides, I can't
 * delete it anyhow because I need it for testing.)
 */
private double kendallCorSmallN(R1, R2)(R1 input1, R2 input2)
in {
    assert(input1.length == input2.length);

    // This function should never be used for any inputs even close to this
    // large because it's a small-N optimization and a more efficient
    // implementation exists in this module for large N, but when N gets this
    // large it's not even correct due to overflow errors.
    assert(input1.length < 1 << 15);
} body {
    // m1/m2 count pairs tied within input1/input2 respectively;
    // s accumulates (concordant - discordant) over all pairs.
    int m1 = 0, m2 = 0;
    int s = 0;

    foreach(i; 0..input2.length) {
        foreach (j; i + 1..input2.length) {
            if(input2[i] > input2[j]) {
                if (input1[i] > input1[j]) {
                    s++;
                } else if(input1[i] < input1[j]) {
                    s--;
                } else if(input1[i] == input1[j]) {
                    m1++;
                } else {
                    // Unorderable element (e.g. NaN): result undefined.
                    return double.nan;
                }
            } else if(input2[i] < input2[j]) {
                if (input1[i] > input1[j]) {
                    s--;
                } else if(input1[i] < input1[j]) {
                    s++;
                } else if(input1[i] == input1[j]) {
                    m1++;
                } else {
                    return double.nan;
                }
            } else if(input2[i] == input2[j]) {
                m2++;

                // Ties in input2 contribute nothing to s, but ties in
                // both variables still count toward m1.
                if(input1[i] < input1[j]) {
                } else if(input1[i] > input1[j]) {
                } else if(input1[i] == input1[j]) {
                    m1++;
                } else {
                    return double.nan;
                }

            } else {
                // Unorderable element in input2 (e.g. NaN).
                return double.nan;
            }
        }
    }

    // Tau-b with tie-corrected denominators.
    immutable nCombination = input2.length * (input2.length - 1) / 2;
    immutable double denominator1 = nCombination - m1;
    immutable double denominator2 = nCombination - m2;
    return s / sqrt(denominator1) / sqrt(denominator2);
}


unittest {
    //Test against known values.
    assert(approxEqual(kendallCor([1,2,3,4,5].dup, [3,1,7,4,3].dup), 0.1054093));
    assert(approxEqual(kendallCor([3,6,7,35,75].dup,[1,63,53,67,3].dup), 0.2));
    assert(approxEqual(kendallCor([1.5,6.3,7.8,4.2,1.5].dup, [1,63,53,67,3].dup), .3162287));

    static void doKendallTest(T)() {
        T[] one = new T[1000], two = new T[1000];
        // Test complex, fast implementation against straightforward,
        // slow implementation.
770 foreach(i; 0..100) { 771 size_t lowerBound = uniform(0, 1000); 772 size_t upperBound = uniform(0, 1000); 773 if(lowerBound > upperBound) swap(lowerBound, upperBound); 774 foreach(ref o; one) { 775 o = uniform(cast(T) -10, cast(T) 10); 776 } 777 foreach(ref o; two) { 778 o = uniform(cast(T) -10, cast(T) 10); 779 } 780 double kOne = 781 kendallCor(one[lowerBound..upperBound], two[lowerBound..upperBound]); 782 double kTwo = 783 kendallCorSmallN(one[lowerBound..upperBound], two[lowerBound..upperBound]); 784 assert(approxEqual(kOne, kTwo) || (isNaN(kOne) && isNaN(kTwo))); 785 } 786 } 787 788 doKendallTest!int(); 789 doKendallTest!float(); 790 doKendallTest!double(); 791 792 // Make sure everything works with lowest common denominator range type. 793 static struct Count { 794 uint num; 795 uint upTo; 796 @property size_t front() { 797 return num; 798 } 799 void popFront() { 800 num++; 801 } 802 @property bool empty() { 803 return num >= upTo; 804 } 805 } 806 807 Count a, b; 808 a.upTo = 100; 809 b.upTo = 100; 810 assert(approxEqual(kendallCor(a, b), 1)); 811 812 // This test will fail if there are overflow bugs, especially in tie 813 // handling. 814 auto rng = chain(repeat(0, 100_000), repeat(1, 100_000)); 815 assert(approxEqual(kendallCor(rng, rng), 1)); 816 817 // Test the case where we have one range sorted already. 
818 assert(kendallCor(iota(5), [3, 1, 2, 5, 4]) == 819 kendallCor(assumeSorted(iota(5)), [3, 1, 2, 5, 4]) 820 ); 821 822 assert(kendallCor(iota(5), [3, 1, 2, 5, 4]) == 823 kendallCor([3, 1, 2, 5, 4], assumeSorted(iota(5))) 824 ); 825 826 assert(approxEqual( 827 kendallCor(assumeSorted(iota(5)), assumeSorted(iota(5))), 1 828 )); 829 830 auto lhs = randArray!rNormal(1_000, 0, 1); 831 auto indices = new size_t[1_000]; 832 import std.algorithm; 833 makeIndex(lhs, indices); 834 835 foreach(i; 0..1_000) { 836 auto rhs = randArray!rNormal(1_000, 0, 1); 837 auto lhsSorted = assumeSorted( 838 indexed(lhs, indices) 839 ); 840 841 // Rearrange rhs according to the sorting permutation of lhs. 842 // kendallCor(lhsSorted, rhsRearranged) will be much faster than 843 // kendallCor(lhs, rhs). 844 auto rhsRearranged = indexed(rhs, indices); 845 assert(kendallCor(lhsSorted, rhsRearranged) == kendallCor(lhs, rhs)); 846 } 847 } 848 849 // Alias to old correlation function names, but don't document them. These will 850 // eventually be deprecated. 851 alias PearsonCor Pcor; 852 alias pearsonCor pcor; 853 alias spearmanCor scor; 854 alias kendallCor kcor; 855 alias kendallCorDestructive kcorDestructive; 856 857 /**Computes the partial correlation between vec1, vec2 given 858 * conditions. conditions can be either a tuple of ranges, a range of ranges, 859 * or (for a single condition) a single range. 860 * 861 * cor is the correlation metric to use. It can be either pearsonCor, 862 * spearmanCor, kendallCor, or any custom correlation metric you can come up 863 * with. 864 * 865 * Examples: 866 * --- 867 * uint[] stock1Price = [8, 6, 7, 5, 3, 0, 9]; 868 * uint[] stock2Price = [3, 1, 4, 1, 5, 9, 2]; 869 * uint[] economicHealth = [2, 7, 1, 8, 2, 8, 1]; 870 * uint[] consumerFear = [1, 2, 3, 4, 5, 6, 7]; 871 * 872 * // See whether the prices of stock 1 and stock 2 are correlated even 873 * // after adjusting for the overall condition of the economy and consumer 874 * // fear. 
875 * double partialCor = 876 * partial!pearsonCor(stock1Price, stock2Price, economicHealth, consumerFear); 877 * --- 878 */ 879 double partial(alias cor, T, U, V...)(T vec1, U vec2, V conditionsIn) 880 if(isInputRange!T && isInputRange!U && allSatisfy!(isInputRange, V)) { 881 auto alloc = newRegionAllocator(); 882 static if(V.length == 1 && isInputRange!(ElementType!(V[0]))) { 883 // Range of ranges. 884 static if(isArray!(V[0])) { 885 alias conditionsIn[0] cond; 886 } else { 887 auto cond = tempdup(cond[0]); 888 } 889 } else { 890 alias conditionsIn cond; 891 } 892 893 auto corMatrix = doubleMatrix(cond.length + 2, cond.length + 2, alloc); 894 foreach(i; 0..corMatrix.rows) corMatrix[i, i] = 1; 895 896 corMatrix[0, 1] = corMatrix[1, 0] = cast(double) cor(vec1, vec2); 897 foreach(i, condition; cond) { 898 immutable conditionIndex = i + 2; 899 corMatrix[0, conditionIndex] = cast(double) cor(vec1, condition); 900 corMatrix[conditionIndex, 0] = corMatrix[0, conditionIndex]; 901 corMatrix[1, conditionIndex] = cast(double) cor(vec2, condition); 902 corMatrix[conditionIndex, 1] = corMatrix[1, conditionIndex]; 903 } 904 905 foreach(i, condition1; cond) { 906 foreach(j, condition2; cond[i + 1..$]) { 907 immutable index1 = i + 2; 908 immutable index2 = index1 + j + 1; 909 corMatrix[index1, index2] = cast(double) cor(condition1, condition2); 910 corMatrix[index2, index1] = corMatrix[index1, index2]; 911 } 912 } 913 914 auto invMatrix = doubleMatrix(cond.length + 2, cond.length + 2, alloc); 915 invert(corMatrix, invMatrix); 916 917 // This code is written so verbosely to work around a compiler segfault 918 // that I have no idea how to reduce. 919 immutable denom = sqrt(invMatrix[0, 0] * invMatrix[1, 1]); 920 immutable negNumer = invMatrix[0, 1]; 921 return -negNumer / denom; 922 } 923 924 unittest { 925 // values from Matlab. 
// (Continuation of a unittest opened earlier in the file: checks partial
// Pearson and Spearman correlation against externally computed values.)
    uint[] stock1Price = [8, 6, 7, 5, 3, 0, 9];
    uint[] stock2Price = [3, 1, 4, 1, 5, 9, 2];
    uint[] economicHealth = [2, 7, 1, 8, 2, 8, 1];
    uint[] consumerFear = [1, 2, 3, 4, 5, 6, 7];
    double partialCor =
        partial!pearsonCor(stock1Price, stock2Price, [economicHealth, consumerFear][]);
    assert(approxEqual(partialCor, -0.857818));

    double spearmanPartial =
        partial!spearmanCor(stock1Price, stock2Price, economicHealth, consumerFear);
    assert(approxEqual(spearmanPartial, -0.7252));
}

// Fallback pool used when the caller passes pool = null.  A TaskPool
// constructed with 0 worker threads makes pool.parallel run its work in
// the calling thread, i.e. effectively serial execution.
private __gshared TaskPool emptyPool;
shared static this() {
    emptyPool = new TaskPool(0);
}

// Adapter that gives a random-access range of ranges the matrix-style
// interface (opIndex(i, j), opIndexAssign, rows) that the *Matrix
// implementations below are written against.  Holds a pointer so that
// writes go through to the caller's range of ranges.
private struct RorToMatrix(RoR) {
    RoR* ror;

    // Element read access: (*ror)[i][j].
    auto ref opIndex(size_t i, size_t j) {
        return (*ror)[i][j];
    }

    // Element write access: (*ror)[i][j] = val.
    void opIndexAssign(typeof((*ror)[0][0]) val, size_t i, size_t j) {
        (*ror)[i][j] = val;
    }

    // Number of rows, i.e. the length of the outer range.
    size_t rows() @property {
        return (*ror).length;
    }
}

// Convenience factory: wraps ror (by address, so ror must outlive the
// wrapper) in an RorToMatrix.
private auto makeMatrixLike(RoR)(ref RoR ror) {
    return RorToMatrix!RoR(&ror);
}

// True iff T already exposes the matrix interface used here:
// a .rows property and two-index assignment t[i, j] = double.
// SciD matrices and RorToMatrix both satisfy this; a plain range of
// ranges does not and gets wrapped by makeMatrixLike instead.
private template isMatrixLike(T) {
    enum isMatrixLike = is(typeof({
        T t;
        size_t r = t.rows;
        t[0, 0] = 2.0;
    }));
}

version(scid) {
    import scid.matrix;

    /**
    These functions allow efficient calculation of the Pearson, Spearman and
    Kendall correlation matrices and the covariance matrix respectively.  They
    are optimized to avoid computing certain values multiple times when
    computing correlations of one vector to several others.

    Note:  These functions are only available when SciD is installed and
    dstats is compiled with version=scid.

    Params:

    mat = A range of ranges to be treated as a set of vectors.  This must be
          a finite range, and must be rectangular, i.e. all elements must be
          the same length.  For Pearson correlation and covariance, the ranges
          must also have elements implicitly convertible to double.
          SciD matrices can be treated as ranges of ranges and can be used
          with these functions.

    taskPool = A std.parallelism.TaskPool used to parallelize the computation.
               This is useful for very large matrices.  If none is provided,
               the computation will not be parallelized.

    Examples:
    ---
    auto input = [[8.0, 6, 7, 5],
                  [3.0, 0, 9, 3],
                  [1.0, 4, 1, 5]];
    auto pearson = pearsonMatrix(input);
    assert(approxEqual(pearson[0, 0], 1));
    ---
    */
    SymmetricMatrix!double pearsonMatrix(RoR)(RoR mat, TaskPool pool = null)
    if(isInputRange!RoR && isInputRange!(ElementType!RoR) &&
       is(ElementType!(ElementType!RoR) : double) &&
       !isInfinite!RoR
    ) {
        SymmetricMatrix!double ret;
        pearsonSpearmanCov!true(mat, pool, CorCovType.pearson, ret);
        return ret;
    }

    /// Ditto
    SymmetricMatrix!double spearmanMatrix(RoR)(RoR mat, TaskPool pool = null)
    if(isInputRange!RoR && isInputRange!(ElementType!RoR) && !isInfinite!RoR) {
        SymmetricMatrix!double ret;
        pearsonSpearmanCov!true(mat, pool, CorCovType.spearman, ret);
        return ret;
    }

    /// Ditto
    SymmetricMatrix!double kendallMatrix(RoR)(RoR mat, TaskPool pool = null)
    if(isInputRange!RoR && isInputRange!(ElementType!RoR) && !isInfinite!RoR) {
        SymmetricMatrix!double ret;
        kendallMatrixImpl!true(mat, ret, pool);
        return ret;
    }

    /// Ditto
    SymmetricMatrix!double covarianceMatrix(RoR)(RoR mat, TaskPool pool = null)
    if(isInputRange!RoR && isInputRange!(ElementType!RoR) &&
       is(ElementType!(ElementType!RoR) : double) &&
       !isInfinite!RoR
    ) {
        SymmetricMatrix!double ret;
        pearsonSpearmanCov!true(mat, pool, CorCovType.covariance, ret);
        return ret;
    }
}

/**
These overloads allow for correlation and covariance matrices to be computed
with the results being stored in a pre-allocated variable, ans.  ans must
be either a SciD matrix or a random-access range of ranges with assignable
elements of a floating point type.  It must have the same number of rows
as the number of vectors in mat and must have at least enough columns in
each row to support storing the lower triangle.  If ans is a full rectangular
matrix/range of ranges, only the lower triangle results will be stored.

Note:  These functions can be used without SciD because they don't return
SciD types.

Examples:
---
auto pearsonRoR = [[0.0], [0.0, 0.0], [0.0, 0.0, 0.0]];
pearsonMatrix(input, pearsonRoR);
assert(approxEqual(pearsonRoR[1][1], 1));
---
*/
void pearsonMatrix(RoR, Ret)(RoR mat, ref Ret ans, TaskPool pool = null)
if(isInputRange!RoR && isInputRange!(ElementType!RoR) &&
   is(ElementType!(ElementType!RoR) : double) &&
   !isInfinite!RoR &&
   (is(typeof(ans[0, 0] = 2.0)) || is(typeof(ans[0][0] = 2.0)))
) {
    // If ans is already matrix-like (e.g. a SciD matrix), use it directly;
    // otherwise adapt the range of ranges in place.
    static if(isMatrixLike!Ret) {
        alias ans ret;
    } else {
        auto ret = makeMatrixLike(ans);
    }

    pearsonSpearmanCov!false(mat, pool, CorCovType.pearson, ret);
}

/// Ditto
void spearmanMatrix(RoR, Ret)(RoR mat, ref Ret ans, TaskPool pool = null)
if(isInputRange!RoR && isInputRange!(ElementType!RoR) && !isInfinite!RoR &&
   (is(typeof(ans[0, 0] = 2.0)) || is(typeof(ans[0][0] = 2.0)))
) {
    static if(isMatrixLike!Ret) {
        alias ans ret;
    } else {
        auto ret = makeMatrixLike(ans);
    }

    pearsonSpearmanCov!false(mat, pool, CorCovType.spearman, ret);
}

/// Ditto
void kendallMatrix(RoR, Ret)(RoR mat, ref Ret ans, TaskPool pool = null)
if(isInputRange!RoR && isInputRange!(ElementType!RoR) && !isInfinite!RoR &&
   (is(typeof(ans[0, 0] = 2.0)) || is(typeof(ans[0][0] = 2.0)))
) {
    static if(isMatrixLike!Ret) {
        alias ans ret;
    } else {
        auto ret = makeMatrixLike(ans);
    }

    kendallMatrixImpl!false(mat, ret, pool);
}

/// Ditto
void covarianceMatrix(RoR, Ret)(RoR mat, ref Ret ans, TaskPool pool = null)
if(isInputRange!RoR && isInputRange!(ElementType!RoR) &&
   is(ElementType!(ElementType!RoR) : double) &&
   !isInfinite!RoR &&
   (is(typeof(ans[0, 0] = 2.0)) || is(typeof(ans[0][0] = 2.0)))
) {
    static if(isMatrixLike!Ret) {
        alias ans ret;
    } else {
        auto ret = makeMatrixLike(ans);
    }

    pearsonSpearmanCov!false(mat, pool, CorCovType.covariance, ret);
}

/*
Fills the lower triangle (diagonal included) of ret with the Kendall tau
correlations of the rows of mat.

If makeNewMatrix is true, ret must be a SymmetricMatrix!double and is
allocated here; otherwise the caller supplies pre-sized storage.

Optimization:  for each row i, the column indices are sorted once by row
i's values.  Every pairing (i, j) then reuses that permutation, so
kendallCor always receives its first argument pre-sorted (it is wrapped
in assumeSorted below) — presumably enabling a faster path in kendallCor
that avoids re-sorting per pair (kendallCor is defined elsewhere; TODO
confirm).
*/
private void kendallMatrixImpl(bool makeNewMatrix, RoR, Matrix)
(RoR mat, ref Matrix ret, TaskPool pool = null) {
    // null means "don't parallelize": the zero-worker pool runs work in
    // the calling thread.
    if(pool is null) {
        pool = emptyPool;
    }

    auto alloc = newRegionAllocator();
    alias ElementType!RoR R;
    alias ElementType!R E;

    // Copy into random-access storage unless mat is already random
    // access per row and restartable (forward) as a whole.
    static if(!isRandomAccessRange!R || !isForwardRange!RoR) {
        auto randomMat = alloc.array(mat
            .map!(r => prepareForSorting(alloc.array(r))));
    } else {
        alias mat randomMat;
    }

    if(randomMat.empty) return;
    immutable nElems = randomMat.front.length;

    // Enforce rectangularity up front rather than failing mid-computation.
    foreach(row; randomMat.save) {
        dstatsEnforce(row.length == nElems,
            "Range of ranges must be rectangular for kendallMatrix."
        );
    }

    static if(makeNewMatrix) {
        static assert(is(typeof(ret) == SymmetricMatrix!double));
        ret = SymmetricMatrix!double(walkLength(randomMat));
    }

    // HACK:  Before the multithreaded portion of this algorithm
    // starts, make sure that there's no need to unshare ret if it's
    // using ref-counted COW semantics.
    ret[0, 0] = 0;

    // Work unit of 1 row: each row's Kendall computation is relatively
    // expensive, so fine-grained distribution balances load.
    foreach(i, row1; pool.parallel(randomMat.save, 1)) {
        auto alloc2 = newRegionAllocator();
        auto indices = alloc2.uninitializedArray!(size_t[])(row1.length);
        foreach(ii, ref elem; indices) elem = ii;

        // Permutation that sorts row1 ascending.
        bool less(size_t a, size_t b) {
            return row1[a] < row1[b];
        }

        qsort!less(indices);
        auto row1Sorted = assumeSorted(indexed(row1, indices));

        // Fill row i of the lower triangle, stopping at the diagonal.
        size_t j = 0;
        foreach(row2; randomMat.save) {
            scope(exit) j++;
            if(i == j) {
                // Kendall tau of any vector with itself is 1 by definition.
                ret[i, i] = 1;
                break;
            }

            // Apply row1's sort permutation to row2 so the pair stays aligned.
            auto row2Rearranged = indexed(row2, indices);
            ret[i, j] = kendallCor(row1Sorted, row2Rearranged);
        }
    }
}

// Selects which statistic pearsonSpearmanCov computes.
private enum CorCovType {
    pearson,
    spearman,
    covariance
}

/*
Shared implementation behind the Pearson, Spearman and covariance matrix
functions.  Each row of mat is copied to a double[] and transformed so
that the desired statistic for any two rows is simply their dot product;
dotMatrix then fills the lower triangle of ret in one pass.

pearson:     row -> (row - mean) / sqrt(mse * nCols).  Assuming
             meanStdev's .mse is the mean squared deviation (defined in
             dstats.summary — TODO confirm), mse * nCols is the sum of
             squared deviations, making dot(i, j) the Pearson r.
spearman:    the same transform applied to the ranks of each row.  Mean
             and stdev must be recomputed per row because ties in the
             data change the rank distribution.
covariance:  row -> (row - mean) / sqrt(nCols - 1), so dot(i, j) is the
             sample (n - 1 denominator) covariance.

NOTE(review):  an empty mat reaches normalized[1..$] below, which is out
of bounds for a zero-length array — verify callers never pass an empty
outer range.
*/
private void pearsonSpearmanCov(bool makeNewMatrix, RoR, Matrix)
(RoR mat, TaskPool pool, CorCovType type, ref Matrix ret) {
    import dstats.summary : mean;
    if(pool is null) {
        pool = emptyPool;
    }

    auto alloc = newRegionAllocator();

    // Materialize every row as double[] so rows can be transformed in
    // place and handed to dotProduct.
    auto normalized = alloc.array(mat
        .map!(r => alloc.array(r.map!(to!double))));

    foreach(row; normalized[1..$]) {
        dstatsEnforce(row.length == normalized[0].length,
            "Matrix must be rectangular for pearsonMatrix.");
    }

    immutable double nCols = (normalized.length) ?
        (normalized[0].length) : 0;

    final switch(type) {
        case CorCovType.pearson:
            foreach(row; pool.parallel(normalized)) {
                immutable msd = meanStdev(row);
                row[] = (row[] - msd.mean) / sqrt(msd.mse * nCols);
            }

            break;
        case CorCovType.covariance:
            // n - 1 denominator: applying 1/sqrt(n - 1) to each of the
            // two rows yields 1/(n - 1) in their dot product.
            immutable divisor = sqrt(nCols - 1.0);
            foreach(row; pool.parallel(normalized)) {
                immutable mu = mean(row).mean;
                row[] -= mu;
                row[] /= divisor;
            }
            break;
        case CorCovType.spearman:
            foreach(row; pool.parallel(normalized)) {
                auto alloc = newRegionAllocator();
                auto buf = alloc.uninitializedArray!(double[])(row.length);
                rank(row, buf);

                // Need to find mean, stdev separately for every row b/c
                // of ties.
                immutable msd = meanStdev(buf);
                row[] = (buf[] - msd.mean) / sqrt(msd.mse * nCols);
            }

            break;
    }

    static if(makeNewMatrix) {
        static assert(is(typeof(ret) == SymmetricMatrix!double));
        ret = SymmetricMatrix!double(normalized.length);
    }

    dotMatrix(normalized, ret, pool);
}

// Fills the lower triangle (j <= i) of ret with the pairwise dot
// products of rows, parallelizing across rows with a work unit of one
// row.
//
// This uses an array-of-arrays to avoid heap fragmentation issues with large
// matrices and because the loss of efficiency from pointer chasing is
// negligible given that we always access them one row at a time.
private void dotMatrix(Matrix)(
    double[][] rows,
    ref Matrix ret,
    TaskPool pool
) in {
    foreach(row; rows[1..$]) {
        assert(row.length == rows[0].length);
    }

    assert(ret.rows == rows.length);
} body {
    // HACK:  Before the multithreaded portion of this algorithm
    // starts, make sure that there's no need to unshare ret if it's
    // using ref-counted COW semantics.
    // NOTE(review):  this write would be out of bounds for an empty
    // rows array — verify callers guarantee rows.length > 0.
    ret[0, 0] = 0;

    foreach(i; pool.parallel(iota(0, rows.length), 1)) {
        auto row1 = rows[i];

        // Only the lower triangle (including the diagonal) is stored.
        foreach(j; 0..i + 1) {
            ret[i, j] = dotProduct(row1, rows[j]);
        }
    }
}

// Cross-checks all four matrix functions, via the range-of-ranges
// overloads and (when compiled with SciD) the SymmetricMatrix overloads.
unittest {
    auto input = [[8.0, 6, 7, 5],
                  [3.0, 0, 9, 3],
                  [1.0, 4, 1, 5]];


    // A ragged lower-triangle buffer: row i has i + 1 columns.
    static double[][] makeRoR() {
        return [[0.0], [0.0, 0.0], [0.0, 0.0, 0.0]];
    }

    auto pearsonRoR = makeRoR();
    pearsonMatrix(input, pearsonRoR);

    auto spearmanRoR = makeRoR();
    spearmanMatrix(input, spearmanRoR);

    auto kendallRoR = makeRoR();
    kendallMatrix(input, kendallRoR);

    auto covRoR = makeRoR();
    covarianceMatrix(input, covRoR);

    // Values from R.

    alias approxEqual ae;  // Save typing.
    assert(ae(pearsonRoR[0][0], 1));
    assert(ae(pearsonRoR[1][1], 1));
    assert(ae(pearsonRoR[2][2], 1));
    assert(ae(pearsonRoR[1][0], 0.3077935));
    assert(ae(pearsonRoR[2][0], -0.9393364));
    assert(ae(pearsonRoR[2][1], -0.6103679));

    assert(ae(spearmanRoR[0][0], 1));
    assert(ae(spearmanRoR[1][1], 1));
    assert(ae(spearmanRoR[2][2], 1));
    assert(ae(spearmanRoR[1][0], 0.3162278));
    assert(ae(spearmanRoR[2][0], -0.9486833));
    assert(ae(spearmanRoR[2][1], -0.5));

    assert(ae(kendallRoR[0][0], 1));
    assert(ae(kendallRoR[1][1], 1));
    assert(ae(kendallRoR[2][2], 1));
    assert(ae(kendallRoR[1][0], 0.1825742));
    assert(ae(kendallRoR[2][0], -0.9128709));
    assert(ae(kendallRoR[2][1], -0.4));

    assert(ae(covRoR[0][0], 1.66666));
    assert(ae(covRoR[1][1], 14.25));
    assert(ae(covRoR[2][2], 4.25));
    assert(ae(covRoR[1][0], 1.5));
    assert(ae(covRoR[2][0], -2.5));
    assert(ae(covRoR[2][1], -4.75));

    version(scid) {
        // The SciD results must agree element-for-element with the
        // range-of-ranges results on the lower triangle.
        static bool test(double[][] a, SymmetricMatrix!double b) {
            foreach(i; 0..3) foreach(j; 0..i + 1) {
                if(!ae(a[i][j], b[i, j])) return false;
            }

            return true;
        }

        auto pearson = pearsonMatrix(input, taskPool);
        auto spearman = spearmanMatrix(input, taskPool);
        auto kendall = kendallMatrix(input, taskPool);
        auto cov = covarianceMatrix(input, taskPool);

        test(pearsonRoR, pearson);
        test(spearmanRoR, spearman);
        test(kendallRoR, kendall);
        test(covRoR, cov);
    }
}