dstats.pca source code

1 /**
2 This module contains a basic implementation of principal component analysis,
3 based on the NIPALS algorithm.  This is fast when you only need the first
4 few components (which is usually the case since PCA's main uses are
5 visualization and dimensionality reduction).  However, convergence slows
6 drastically after the first few components have been removed and most of
7 the matrix is just noise.
8 
9 References:
10 
11 en.wikipedia.org/wiki/Principal_component_analysis#Computing_principal_components_iteratively
12 
13 Author:  David Simcha
14 */
15 
16 /*
17  * License:
18  * Boost Software License - Version 1.0 - August 17th, 2003
19  *
20  * Permission is hereby granted, free of charge, to any person or organization
21  * obtaining a copy of the software and accompanying documentation covered by
22  * this license (the "Software") to use, reproduce, display, distribute,
23  * execute, and transmit the Software, and to prepare derivative works of the
24  * Software, and to permit third-parties to whom the Software is furnished to
25  * do so, all subject to the following:
26  *
27  * The copyright notices in the Software and this entire statement, including
28  * the above license grant, this restriction and the following disclaimer,
29  * must be included in all copies of the Software, in whole or in part, and
30  * all derivative works of the Software, unless such copies or derivative
31  * works are solely in the form of machine-executable object code generated by
32  * a source language processor.
33  *
34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
37  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
38  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
39  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
40  * DEALINGS IN THE SOFTWARE.
41  */
42 module dstats.pca;
43 
44 import std.range, dstats.base, dstats.alloc, std.numeric, std.stdio, std.math,
45     std.algorithm, std.array, dstats.summary, dstats.random, std.conv,
46     std.exception, dstats.regress, std.traits;
47 
48 /// Result holder
49 struct PrincipalComponent {
50     /// The projection of the data onto the first principal component.
51     double[] x;
52 
53     /// The vector representing the first principal component loadings.
54     double[] rotation;
55 }
56 
57 /**
58 Sets options for principal component analysis.  The default options are
59 also the values in PrinCompOptions.init.
60 */
61 struct PrinCompOptions {
62     ///  Center each column to zero mean.  Default value:  true.
63     bool zeroMean = true;
64 
65     /**
66     Scale each column to unit variance.  Note that, if this option is set to
67     true, zeroMean is ignored and the mean of each column is set to zero even
68     if zeroMean is false.  Default value:  false.
69     */
70     bool unitVariance = false;
71 
72     /**
73     Overwrite input matrix instead of copying.  Ignored if the matrix
74     passed in does not have assignable, lvalue elements and centering or
75     scaling is enabled.  Default value:  false.
76     */
77     bool destructive = false;
78 
79     /**
80     Effectively transpose the matrix.  If enabled, treat each column as a
81     data points and each row as a dimension.  If disabled, do the opposite.
82     Note that, if this is enabled, each row will be scaled and centered,
83     not each column.  Default value:  false.
84     */
85     bool transpose = false;
86 
87     /**
88     Relative error at which to stop the optimization procedure.  Default: 1e-4
89     */
90     double relError = 1.0e-4;
91 
92     /**
93     Absolute error at which to stop the optimization procedure.  Default:  1e-5
94     */
95     double absError = 1.0e-5;
96 
97     /**
98     Maximum iterations for the optimization procedure.  After this many
99     iterations, the algorithm gives up and calls teh solution "good enough"
100     no matter what.  For exploratory analyses, "good enough" solutions
101     can be had fast sometimes by making this value small.  Default:  uint.max
102     */
103     uint maxIter = uint.max;
104 
105     private void doCenterScaleTransposed(R)(R data) {
106         foreach(row; data.save) {
107             immutable msd = meanStdev(row.save);
108 
109             foreach(ref elem; row) {
110                 // Already checked for whether we're supposed to be normalizing.
111                 elem -= msd.mean;
112                 if(unitVariance) elem /= msd.stdev;
113             }
114         }
115     }
116 
117     private void doCenterScale(R)(R data) {
118         if(!zeroMean && !unitVariance) return;
119         if(data.empty) {
120             return;
121         }
122 
123         if(transpose) return doCenterScaleTransposed(data);
124 
125         auto alloc = newRegionAllocator();
126         immutable rowLen = walkLength(data.front.save);
127 
128         auto summs = alloc.uninitializedArray!(MeanSD[])(rowLen);
129         summs[] = MeanSD.init;
130         foreach(row; data) {
131             size_t i = 0;
132             foreach(elem; row) {
133                 enforce(i < rowLen, "Matrix must be rectangular for PCA.");
134                 summs[i++].put(elem);
135             }
136 
137             enforce(i == rowLen, "Matrix must be rectangular for PCA.");
138         }
139 
140         foreach(row; data) {
141             size_t i = 0;
142             foreach(ref elem; row) {
143                 elem -= summs[i].mean;
144                 if(unitVariance) elem /= summs[i].stdev;
145                 i++;
146             }
147         }
148     }
149 }
150 
151 
152 /**
153 Uses expectation-maximization to compute the first principal component of mat.
154 Since there are a lot of options, they are controlled by a PrinCompOptions
155 struct.  (See above.  PrinCompOptions.init contains the default values.)
156 To have the results returned in a pre-allocated space, pass an explicit value
157 for buf.
158 */
159 PrincipalComponent firstComponent(Ror)(
160     Ror data,
161     PrinCompOptions opts = PrinCompOptions.init,
162     PrincipalComponent buf = PrincipalComponent.init
163 ) {
164     auto alloc = newRegionAllocator();
165 
166     PrincipalComponent doNonDestructive() {
167         double[][] dataFixed;
168 
169         if(opts.transpose) {
170             dataFixed = transposeDup(data, alloc);
171         } else {
172             dataFixed = doubleTempdupMatrix(data, alloc);
173         }
174 
175         opts.transpose = false;  // We already transposed if necessary.
176         opts.doCenterScale(dataFixed);
177         return firstComponentImpl(dataFixed, buf, opts);
178     }
179 
180     static if(!hasLvalueElements!(ElementType!Ror) ||
181     !hasAssignableElements!(ElementType!Ror)) {
182         if(opts.zeroMean || opts.unitVariance) {
183             return doNonDestructive();
184         } else {
185             return firstComponentImpl(data, buf, opts);
186         }
187     } else {
188         if(!opts.destructive) {
189             return doNonDestructive;
190         }
191 
192         opts.doCenterScale(data);
193         return firstComponentImpl(data, buf, opts);
194     }
195 }
196 
197 private PrincipalComponent firstComponentImpl(Ror)(
198     Ror data,
199     PrincipalComponent buf,
200     PrinCompOptions opts
201 ) {
202     auto alloc = newRegionAllocator();
203 
204     if(data.empty) return typeof(return).init;
205     size_t rowLen = walkLength(data.front.save);
206     size_t colLen = walkLength(data.save);
207 
208     immutable transposed = opts.transpose;
209     if(transposed) swap(rowLen, colLen);
210 
211     auto t = alloc.uninitializedArray!(double[])(rowLen);
212     auto p = (buf.rotation.length >= rowLen) ?
213               buf.rotation[0..rowLen] : new double[rowLen];
214     p[] = 1;
215 
216     bool approxEqualOrNotFinite(const double[] a, const double[] b) {
217         foreach(i; 0..a.length) {
218             if(!isFinite(a[i]) || !isFinite(b[i])) {
219                 return true;
220             } else if(!approxEqual(a[i], b[i], opts.relError, opts.absError)) {
221                 return false;
222             }
223         }
224 
225         return true;
226     }
227 
228     uint iter;
229     for(; iter < opts.maxIter; iter++) {
230         t[] = 0;
231 
232         if(transposed) {
233             auto dps = alloc.uninitializedArray!(double[])(colLen);
234             scope(exit) alloc.freeLast();
235             dps[] = 0;
236 
237             size_t i = 0;
238             foreach(row; data.save) {
239                 scope(exit) i++;
240 
241                 static if(is(typeof(row) : const(double)[])) {
242                     // Take advantage of array ops.
243                     dps[] += p[i] * row[];
244                 } else {
245                     size_t j = 0;
246                     foreach(elem; row) {
247                         scope(exit) j++;
248                         dps[j] += p[i] * elem;
249                     }
250                 }
251             }
252 
253             i = 0;
254             foreach(row; data.save) {
255                 scope(exit) i++;
256                 t[i] += dotProduct(row, dps);
257             }
258 
259         } else {
260             foreach(row; data.save) {
261                 immutable dp = dotProduct(p, row);
262                 static if( is(typeof(row) : const(double)[] )) {
263                     // Use array op optimization if possible.
264                     t[] += row[] * dp;
265                 } else {
266                     size_t i = 0;
267                     foreach(elem; row.save) {
268                         t[i++] += elem * dp;
269                     }
270                 }
271             }
272         }
273 
274         immutable tMagnitude = magnitude(t);
275         t[] /= tMagnitude;
276 
277         if(approxEqualOrNotFinite(t, p)) {
278             p[] = t[];
279             break;
280         }
281 
282         p[] = t[];
283     }
284 
285     auto x = (buf.x.length >= colLen) ?
286               buf.x[0..colLen] : new double[colLen];
287     size_t i = 0;
288 
289     if(transposed) {
290         x[] = 0;
291 
292         size_t rowIndex = 0;
293         foreach(row; data) {
294             scope(exit) rowIndex++;
295             size_t colIndex = 0;
296 
297             foreach(elem; row) {
298                 scope(exit) colIndex++;
299                 x[colIndex] += p[rowIndex] * elem;
300             }
301         }
302 
303     } else {
304         foreach(row; data) {
305             x[i++] = dotProduct(p, row);
306         }
307     }
308 
309     return PrincipalComponent(x, p);
310 }
311 
312 /// Used for removeComponent().
313 enum Transposed : bool {
314 
315     ///
316     yes = true,
317 
318     ///
319     no = false
320 }
321 
322 /**
323 Remove the principal component specified by the given rotation vector from
324 data.  data must have assignable elements.  Transposed controls whether
325 rotation is considered a loading for the transposed matrix or the matrix
326 as-is.
327 */
328 void removeComponent(Ror, R)(
329     Ror data,
330     R rotation,
331     Transposed transposed = Transposed.no
332 ) {
333     double[2] regressBuf;
334 
335     immutable rotMagNeg1 = 1.0 / magnitude(rotation.save);
336 
337     if(transposed) {
338         auto alloc = newRegionAllocator();
339         auto dps = alloc.uninitializedArray!(double[])
340             (walkLength(data.front.save));
341         dps[] = 0;
342 
343         auto r2 = rotation.save;
344         foreach(row; data.save) {
345             scope(exit) r2.popFront();
346 
347             size_t j = 0;
348 
349             foreach(elem; row) {
350                 scope(exit) j++;
351                 dps[j] += r2.front * elem;
352             }
353         }
354 
355         dps[] *= rotMagNeg1;
356 
357         r2 = rotation.save;
358         foreach(row; data.save) {
359             scope(exit) r2.popFront();
360 
361             auto rs = row.save;
362             for(size_t j = 0; !rs.empty; rs.popFront, j++) {
363                 rs.front = rs.front - r2.front * dps[j];
364             }
365         }
366 
367     } else {
368         foreach(row; data.save) {
369             immutable dotProd = dotProduct(rotation, row);
370             immutable coeff = dotProd * rotMagNeg1;
371 
372             auto rs = row.save;
373             auto rots = rotation.save;
374             while(!rs.empty && !rots.empty) {
375                 scope(exit) {
376                     rs.popFront();
377                     rots.popFront();
378                 }
379 
380                 rs.front = rs.front - rots.front * coeff;
381             }
382         }
383     }
384 }
385 
386 /**
387 Computes the first N principal components of the matrix.  More efficient than
388 calling firstComponent and removeComponent repeatedly because copying and
389 transposing, if enabled, only happen once.
390 */
391 PrincipalComponent[] firstNComponents(Ror)(
392     Ror data,
393     uint n,
394     PrinCompOptions opts = PrinCompOptions.init,
395     PrincipalComponent[] buf = null
396 ) {
397 
398     auto alloc = newRegionAllocator();
399 
400     PrincipalComponent[] doNonDestructive() {
401         double[][] dataFixed;
402 
403         if(opts.transpose) {
404             dataFixed = transposeDup(data, alloc);
405         } else {
406             dataFixed = doubleTempdupMatrix(data, alloc);
407         }
408 
409         opts.transpose = false;  // We already transposed if necessary.
410         opts.doCenterScale(dataFixed);
411         return firstNComponentsImpl(dataFixed, n, opts, buf);
412     }
413 
414     static if(!hasLvalueElements!(ElementType!Ror) ||
415     !hasAssignableElements!(ElementType!Ror)) {
416         return doNonDestructive();
417     } else {
418         if(!opts.destructive) {
419             return doNonDestructive();
420         }
421 
422         opts.doCenterScale(data);
423         return firstNComponentsImpl(data, n, opts, buf);
424     }
425 }
426 
427 private PrincipalComponent[] firstNComponentsImpl(Ror)(Ror data, uint n,
428     PrinCompOptions opts, PrincipalComponent[] buf = null) {
429 
430     opts.destructive = true;  // We already copied if necessary.
431     opts.unitVariance = false;  // Already did this.
432 
433     buf.length = n;
434     foreach(comp; 0..n) {
435         if(comp != 0) {
436             removeComponent(data, buf[comp - 1].rotation,
437                 cast(Transposed) opts.transpose);
438         }
439 
440         buf[comp] = firstComponent(data, opts, buf[comp]);
441     }
442 
443     return buf;
444 }
445 
446 private double magnitude(R)(R x) {
447     return sqrt(reduce!"a + b * b"(0.0, x));
448 }
449 
450 // Convert the matrix to a double[][].
451 double[] doubleTempdup(R)(R range, RegionAllocator alloc) {
452     return alloc.array(map!(to!double)(range));
453 }
454 
455 private double[][] doubleTempdupMatrix(R)(R data, RegionAllocator alloc) {
456     auto dataFixed = alloc.uninitializedArray!(double[][])
457         (data.length);
458     foreach(i, ref elem; dataFixed) {
459         elem = doubleTempdup(data[i], alloc);
460     }
461 
462     return dataFixed;
463 }
464 
465 private double[][] transposeDup(Ror)(Ror data, RegionAllocator alloc) {
466     if(data.empty) return null;
467 
468     immutable rowLen = walkLength(data.front.save);
469     immutable colLen = walkLength(data.save);
470     auto ret = alloc.uninitializedArray!(double[][])(rowLen, colLen);
471 
472     size_t i = 0;
473     foreach(row; data) {
474         scope(exit) i++;
475         if(i == colLen) break;
476 
477         size_t j = 0;
478         foreach(col; row) {
479             scope(exit) j++;
480             if(j == rowLen) break;
481             ret[j][i] = col;
482         }
483 
484         dstatsEnforce(j == rowLen, "Matrices must be rectangular for PCA.");
485     }
486 
487     dstatsEnforce(i == colLen, "Matrices must be rectangular for PCA.");
488     return ret;
489 }
490 
491 version(unittest) {
492     // There are two equally valid answers for PCA that differ only by sign.
493     // This tests whether one of them matches the test value.
494     bool plusMinusAe(T, U)(T lhs, U rhs) {
495         return approxEqual(lhs, rhs) || approxEqual(lhs, map!"-a"(rhs));
496     }
497 }
498 
499 unittest {
500     // Values from R's prcomp function.  Not testing the 4th component because
501     // it's mostly numerical fuzz.
502 
503     static double[][] getMat() {
504         return [[3,6,2,4], [3,6,8,8], [6,7,5,3], [0,9,3,1]];
505     }
506 
507     auto mat = getMat();
508     auto allComps = firstNComponents(mat, 3);
509 
510     assert(plusMinusAe(allComps[0].x, [1.19, -5.11, -0.537, 4.45]));
511     assert(plusMinusAe(allComps[0].rotation, [-0.314, 0.269, -0.584, -0.698]));
512 
513     assert(plusMinusAe(allComps[1].x, [0.805, -1.779, 2.882, -1.908]));
514     assert(plusMinusAe(allComps[1].rotation, [0.912, -0.180, -0.2498, -0.2713]));
515 
516     assert(plusMinusAe(allComps[2].x, [2.277, -0.1055, -1.2867, -0.8849]));
517     assert(plusMinusAe(allComps[2].rotation, [-0.1578, -0.5162, -0.704, 0.461]));
518 
519     auto comp1 = firstComponent(mat);
520     assert(plusMinusAe(comp1.x, allComps[0].x));
521     assert(plusMinusAe(comp1.rotation, allComps[0].rotation));
522 
523     // Test transposed.
524     PrinCompOptions opts;
525     opts.transpose = true;
526     const double[][] m2 = mat;
527     auto allCompsT = firstNComponents(m2, 3, opts);
528 
529     assert(plusMinusAe(allCompsT[0].x, [-3.2045, 6.3829695, -0.7227162, -2.455]));
530     assert(plusMinusAe(allCompsT[0].rotation, [0.3025, 0.05657, 0.25142, 0.91763]));
531 
532     assert(plusMinusAe(allCompsT[1].x, [-3.46136, -0.6365, 1.75111, 2.3468]));
533     assert(plusMinusAe(allCompsT[1].rotation,
534         [-0.06269096,  0.88643747, -0.4498119, 0.08926183]));
535 
536     assert(plusMinusAe(allCompsT[2].x,
537         [2.895362e-03,  3.201053e-01, -1.631345e+00,  1.308344e+00]));
538     assert(plusMinusAe(allCompsT[2].rotation,
539         [0.87140678, -0.14628160, -0.4409721, -0.15746595]));
540 
541     auto comp1T = firstComponent(m2, opts);
542     assert(plusMinusAe(comp1T.x, allCompsT[0].x));
543     assert(plusMinusAe(comp1T.rotation, allCompsT[0].rotation));
544 
545     // Test with scaling.
546     opts.unitVariance = true;
547     opts.transpose = false;
548     auto allCompsScale = firstNComponents(mat, 3, opts);
549     assert(plusMinusAe(allCompsScale[0].x,
550         [6.878307e-02, -1.791647e+00, -3.733826e-01,  2.096247e+00]));
551     assert(plusMinusAe(allCompsScale[0].rotation,
552         [-0.3903603,  0.5398265, -0.4767623, -0.5735014]));
553 
554     assert(plusMinusAe(allCompsScale[1].x,
555         [6.804833e-01, -9.412491e-01,  9.231432e-01, -6.623774e-01]));
556     assert(plusMinusAe(allCompsScale[1].rotation,
557         [0.7355678, -0.2849885, -0.5068900, -0.3475401]));
558 
559     assert(plusMinusAe(allCompsScale[2].x,
560         [9.618048e-01,  1.428492e-02, -8.120905e-01, -1.639992e-01]));
561     assert(plusMinusAe(allCompsScale[2].rotation,
562             [-0.4925027, -0.5721616, -0.5897120, 0.2869006]));
563 
564     auto comp1S = firstComponent(m2, opts);
565     assert(plusMinusAe(comp1S.x, allCompsScale[0].x));
566     assert(plusMinusAe(comp1S.rotation, allCompsScale[0].rotation));
567 
568     opts.transpose = true;
569     auto allTScale = firstNComponents(mat, 3, opts);
570 
571     assert(plusMinusAe(allTScale[0].x,
572         [-1.419319e-01,  2.141908e+00, -8.368606e-01, -1.163116e+00]));
573     assert(plusMinusAe(allTScale[0].rotation,
574         [0.5361711, -0.2270814,  0.5685768,  0.5810981]));
575 
576     assert(plusMinusAe(allTScale[1].x,
577         [-1.692899e+00,  4.929717e-01,  3.049089e-01,  8.950189e-01]));
578     assert(plusMinusAe(allTScale[1].rotation,
579         [0.3026505,  0.7906601, -0.3652524,  0.3871047]));
580 
581     assert(plusMinusAe(allTScale[2].x,
582         [ 2.035977e-01,  2.705193e-02, -9.113051e-01,  6.806556e-01]));
583     assert(plusMinusAe(allTScale[2].rotation,
584             [0.7333168, -0.3396207, -0.4837054, -0.3360555]));
585 
586     auto comp1ST = firstComponent(m2, opts);
587     assert(plusMinusAe(comp1ST.x, allTScale[0].x));
588     assert(plusMinusAe(comp1ST.rotation, allTScale[0].rotation));
589 
590     void compAll(PrincipalComponent[] lhs, PrincipalComponent[] rhs) {
591         assert(lhs.length == rhs.length);
592         foreach(i, elem; lhs) {
593             assert(plusMinusAe(elem.x, rhs[i].x));
594             assert(plusMinusAe(elem.rotation, rhs[i].rotation));
595         }
596     }
597 
598     opts.destructive = true;
599     auto allDestructive = firstNComponents(mat, 3, opts);
600     compAll(allTScale, allDestructive);
601     compAll([firstComponent(getMat(), opts)], allDestructive[0..1]);
602 
603     mat = getMat();
604     opts.transpose = false;
605     allDestructive = firstNComponents(mat, 3, opts);
606     compAll(allDestructive, allCompsScale);
607     compAll([firstComponent(getMat(), opts)], allDestructive[0..1]);
608 
609     mat = getMat();
610     opts.unitVariance = false;
611     allDestructive = firstNComponents(mat, 3, opts);
612     compAll(allDestructive, allComps);
613     compAll([firstComponent(getMat(), opts)], allDestructive[0..1]);
614 
615     mat = getMat();
616     opts.transpose = true;
617     allDestructive = firstNComponents(mat, 3, opts);
618     compAll(allDestructive, allCompsT);
619     compAll([firstComponent(getMat(), opts)], allDestructive[0..1]);
620 }