<!DOCTYPE html>
<html lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Chapter 15 Unsupervised learning | Machine Learning for Factor Investing</title>
<meta name="description" content="Chapter 15 Unsupervised learning | Machine Learning for Factor Investing" />
<meta name="generator" content="bookdown 0.21 and GitBook 2.6.7" />
<meta property="og:title" content="Chapter 15 Unsupervised learning | Machine Learning for Factor Investing" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 15 Unsupervised learning | Machine Learning for Factor Investing" />
<meta name="author" content="Guillaume Coqueret and Tony Guida" />
<meta name="date" content="2021-01-08" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<link rel="prev" href="causality.html"/>
<link rel="next" href="RL.html"/>
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />
<script src="libs/accessible-code-block-0.0.1/empty-anchor.js"></script>
<link href="libs/anchor-sections-1.0/anchor-sections.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.0/anchor-sections.js"></script>
<script src="libs/kePrint-0.0.1/kePrint.js"></script>
<link href="libs/lightable-0.0.1/lightable.css" rel="stylesheet" />
<style type="text/css">
code.sourceCode > span { display: inline-block; line-height: 1.25; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html"><i class="fa fa-check"></i>Preface</a><ul>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#what-this-book-is-not-about"><i class="fa fa-check"></i>What this book is not about</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#the-targeted-audience"><i class="fa fa-check"></i>The targeted audience</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#how-this-book-is-structured"><i class="fa fa-check"></i>How this book is structured</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#companion-website"><i class="fa fa-check"></i>Companion website</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#why-r"><i class="fa fa-check"></i>Why R?</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#coding-instructions"><i class="fa fa-check"></i>Coding instructions</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#acknowledgments"><i class="fa fa-check"></i>Acknowledgments</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#future-developments"><i class="fa fa-check"></i>Future developments</a></li>
</ul></li>
<li class="part"><span><b>I Introduction</b></span></li>
<li class="chapter" data-level="1" data-path="notdata.html"><a href="notdata.html"><i class="fa fa-check"></i><b>1</b> Notations and data</a><ul>
<li class="chapter" data-level="1.1" data-path="notdata.html"><a href="notdata.html#notations"><i class="fa fa-check"></i><b>1.1</b> Notations</a></li>
<li class="chapter" data-level="1.2" data-path="notdata.html"><a href="notdata.html#dataset"><i class="fa fa-check"></i><b>1.2</b> Dataset</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="intro.html"><a href="intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
<li class="chapter" data-level="2.1" data-path="intro.html"><a href="intro.html#context"><i class="fa fa-check"></i><b>2.1</b> Context</a></li>
<li class="chapter" data-level="2.2" data-path="intro.html"><a href="intro.html#portfolio-construction-the-workflow"><i class="fa fa-check"></i><b>2.2</b> Portfolio construction: the workflow</a></li>
<li class="chapter" data-level="2.3" data-path="intro.html"><a href="intro.html#machine-learning-is-no-magic-wand"><i class="fa fa-check"></i><b>2.3</b> Machine learning is no magic wand</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="factor.html"><a href="factor.html"><i class="fa fa-check"></i><b>3</b> Factor investing and asset pricing anomalies</a><ul>
<li class="chapter" data-level="3.1" data-path="factor.html"><a href="factor.html#introduction"><i class="fa fa-check"></i><b>3.1</b> Introduction</a></li>
<li class="chapter" data-level="3.2" data-path="factor.html"><a href="factor.html#detecting-anomalies"><i class="fa fa-check"></i><b>3.2</b> Detecting anomalies</a><ul>
<li class="chapter" data-level="3.2.1" data-path="factor.html"><a href="factor.html#challenges"><i class="fa fa-check"></i><b>3.2.1</b> Challenges</a></li>
<li class="chapter" data-level="3.2.2" data-path="factor.html"><a href="factor.html#simple-portfolio-sorts"><i class="fa fa-check"></i><b>3.2.2</b> Simple portfolio sorts </a></li>
<li class="chapter" data-level="3.2.3" data-path="factor.html"><a href="factor.html#factors"><i class="fa fa-check"></i><b>3.2.3</b> Factors</a></li>
<li class="chapter" data-level="3.2.4" data-path="factor.html"><a href="factor.html#predictive-regressions-sorts-and-p-value-issues"><i class="fa fa-check"></i><b>3.2.4</b> Predictive regressions, sorts, and p-value issues</a></li>
<li class="chapter" data-level="3.2.5" data-path="factor.html"><a href="factor.html#fama-macbeth-regressions"><i class="fa fa-check"></i><b>3.2.5</b> Fama-Macbeth regressions</a></li>
<li class="chapter" data-level="3.2.6" data-path="factor.html"><a href="factor.html#factor-competition"><i class="fa fa-check"></i><b>3.2.6</b> Factor competition</a></li>
<li class="chapter" data-level="3.2.7" data-path="factor.html"><a href="factor.html#advanced-techniques"><i class="fa fa-check"></i><b>3.2.7</b> Advanced techniques</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="factor.html"><a href="factor.html#factors-or-characteristics"><i class="fa fa-check"></i><b>3.3</b> Factors or characteristics?</a></li>
<li class="chapter" data-level="3.4" data-path="factor.html"><a href="factor.html#hot-topics-momentum-timing-and-esg"><i class="fa fa-check"></i><b>3.4</b> Hot topics: momentum, timing and ESG</a><ul>
<li class="chapter" data-level="3.4.1" data-path="factor.html"><a href="factor.html#factor-momentum"><i class="fa fa-check"></i><b>3.4.1</b> Factor momentum</a></li>
<li class="chapter" data-level="3.4.2" data-path="factor.html"><a href="factor.html#factor-timing"><i class="fa fa-check"></i><b>3.4.2</b> Factor timing</a></li>
<li class="chapter" data-level="3.4.3" data-path="factor.html"><a href="factor.html#the-green-factors"><i class="fa fa-check"></i><b>3.4.3</b> The green factors</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="factor.html"><a href="factor.html#the-links-with-machine-learning"><i class="fa fa-check"></i><b>3.5</b> The links with machine learning</a><ul>
<li class="chapter" data-level="3.5.1" data-path="factor.html"><a href="factor.html#a-short-list-of-recent-references"><i class="fa fa-check"></i><b>3.5.1</b> A short list of recent references</a></li>
<li class="chapter" data-level="3.5.2" data-path="factor.html"><a href="factor.html#explicit-connections-with-asset-pricing-models"><i class="fa fa-check"></i><b>3.5.2</b> Explicit connections with asset pricing models</a></li>
</ul></li>
<li class="chapter" data-level="3.6" data-path="factor.html"><a href="factor.html#coding-exercises"><i class="fa fa-check"></i><b>3.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="Data.html"><a href="Data.html"><i class="fa fa-check"></i><b>4</b> Data preprocessing</a><ul>
<li class="chapter" data-level="4.1" data-path="Data.html"><a href="Data.html#know-your-data"><i class="fa fa-check"></i><b>4.1</b> Know your data</a></li>
<li class="chapter" data-level="4.2" data-path="Data.html"><a href="Data.html#missing-data"><i class="fa fa-check"></i><b>4.2</b> Missing data</a></li>
<li class="chapter" data-level="4.3" data-path="Data.html"><a href="Data.html#outlier-detection"><i class="fa fa-check"></i><b>4.3</b> Outlier detection</a></li>
<li class="chapter" data-level="4.4" data-path="Data.html"><a href="Data.html#feateng"><i class="fa fa-check"></i><b>4.4</b> Feature engineering</a><ul>
<li class="chapter" data-level="4.4.1" data-path="Data.html"><a href="Data.html#feature-selection"><i class="fa fa-check"></i><b>4.4.1</b> Feature selection</a></li>
<li class="chapter" data-level="4.4.2" data-path="Data.html"><a href="Data.html#scaling"><i class="fa fa-check"></i><b>4.4.2</b> Scaling the predictors</a></li>
</ul></li>
<li class="chapter" data-level="4.5" data-path="Data.html"><a href="Data.html#labelling"><i class="fa fa-check"></i><b>4.5</b> Labelling</a><ul>
<li class="chapter" data-level="4.5.1" data-path="Data.html"><a href="Data.html#simple-labels"><i class="fa fa-check"></i><b>4.5.1</b> Simple labels</a></li>
<li class="chapter" data-level="4.5.2" data-path="Data.html"><a href="Data.html#categorical-labels"><i class="fa fa-check"></i><b>4.5.2</b> Categorical labels</a></li>
<li class="chapter" data-level="4.5.3" data-path="Data.html"><a href="Data.html#the-triple-barrier-method"><i class="fa fa-check"></i><b>4.5.3</b> The triple barrier method</a></li>
<li class="chapter" data-level="4.5.4" data-path="Data.html"><a href="Data.html#filtering-the-sample"><i class="fa fa-check"></i><b>4.5.4</b> Filtering the sample</a></li>
<li class="chapter" data-level="4.5.5" data-path="Data.html"><a href="Data.html#horizons"><i class="fa fa-check"></i><b>4.5.5</b> Return horizons</a></li>
</ul></li>
<li class="chapter" data-level="4.6" data-path="Data.html"><a href="Data.html#pers"><i class="fa fa-check"></i><b>4.6</b> Handling persistence</a></li>
<li class="chapter" data-level="4.7" data-path="Data.html"><a href="Data.html#extensions"><i class="fa fa-check"></i><b>4.7</b> Extensions</a><ul>
<li class="chapter" data-level="4.7.1" data-path="Data.html"><a href="Data.html#transforming-features"><i class="fa fa-check"></i><b>4.7.1</b> Transforming features</a></li>
<li class="chapter" data-level="4.7.2" data-path="Data.html"><a href="Data.html#macrovar"><i class="fa fa-check"></i><b>4.7.2</b> Macro-economic variables</a></li>
<li class="chapter" data-level="4.7.3" data-path="Data.html"><a href="Data.html#active-learning"><i class="fa fa-check"></i><b>4.7.3</b> Active learning</a></li>
</ul></li>
<li class="chapter" data-level="4.8" data-path="Data.html"><a href="Data.html#additional-code-and-results"><i class="fa fa-check"></i><b>4.8</b> Additional code and results</a><ul>
<li class="chapter" data-level="4.8.1" data-path="Data.html"><a href="Data.html#impact-of-rescaling-graphical-representation"><i class="fa fa-check"></i><b>4.8.1</b> Impact of rescaling: graphical representation</a></li>
<li class="chapter" data-level="4.8.2" data-path="Data.html"><a href="Data.html#impact-of-rescaling-toy-example"><i class="fa fa-check"></i><b>4.8.2</b> Impact of rescaling: toy example</a></li>
</ul></li>
<li class="chapter" data-level="4.9" data-path="Data.html"><a href="Data.html#coding-exercises-1"><i class="fa fa-check"></i><b>4.9</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>II Common supervised algorithms</b></span></li>
<li class="chapter" data-level="5" data-path="lasso.html"><a href="lasso.html"><i class="fa fa-check"></i><b>5</b> Penalized regressions and sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.1" data-path="lasso.html"><a href="lasso.html#penalized-regressions"><i class="fa fa-check"></i><b>5.1</b> Penalized regressions</a><ul>
<li class="chapter" data-level="5.1.1" data-path="lasso.html"><a href="lasso.html#penreg"><i class="fa fa-check"></i><b>5.1.1</b> Simple regressions</a></li>
<li class="chapter" data-level="5.1.2" data-path="lasso.html"><a href="lasso.html#forms-of-penalizations"><i class="fa fa-check"></i><b>5.1.2</b> Forms of penalizations</a></li>
<li class="chapter" data-level="5.1.3" data-path="lasso.html"><a href="lasso.html#illustrations"><i class="fa fa-check"></i><b>5.1.3</b> Illustrations</a></li>
</ul></li>
<li class="chapter" data-level="5.2" data-path="lasso.html"><a href="lasso.html#sparse-hedging-for-minimum-variance-portfolios"><i class="fa fa-check"></i><b>5.2</b> Sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.2.1" data-path="lasso.html"><a href="lasso.html#presentation-and-derivations"><i class="fa fa-check"></i><b>5.2.1</b> Presentation and derivations</a></li>
<li class="chapter" data-level="5.2.2" data-path="lasso.html"><a href="lasso.html#sparseex"><i class="fa fa-check"></i><b>5.2.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="5.3" data-path="lasso.html"><a href="lasso.html#predictive-regressions"><i class="fa fa-check"></i><b>5.3</b> Predictive regressions</a><ul>
<li class="chapter" data-level="5.3.1" data-path="lasso.html"><a href="lasso.html#literature-review-and-principle"><i class="fa fa-check"></i><b>5.3.1</b> Literature review and principle</a></li>
<li class="chapter" data-level="5.3.2" data-path="lasso.html"><a href="lasso.html#code-and-results"><i class="fa fa-check"></i><b>5.3.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="5.4" data-path="lasso.html"><a href="lasso.html#coding-exercise"><i class="fa fa-check"></i><b>5.4</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="trees.html"><a href="trees.html"><i class="fa fa-check"></i><b>6</b> Tree-based methods</a><ul>
<li class="chapter" data-level="6.1" data-path="trees.html"><a href="trees.html#simple-trees"><i class="fa fa-check"></i><b>6.1</b> Simple trees</a><ul>
<li class="chapter" data-level="6.1.1" data-path="trees.html"><a href="trees.html#principle"><i class="fa fa-check"></i><b>6.1.1</b> Principle</a></li>
<li class="chapter" data-level="6.1.2" data-path="trees.html"><a href="trees.html#treeclass"><i class="fa fa-check"></i><b>6.1.2</b> Further details on classification</a></li>
<li class="chapter" data-level="6.1.3" data-path="trees.html"><a href="trees.html#pruning-criteria"><i class="fa fa-check"></i><b>6.1.3</b> Pruning criteria</a></li>
<li class="chapter" data-level="6.1.4" data-path="trees.html"><a href="trees.html#code-and-interpretation"><i class="fa fa-check"></i><b>6.1.4</b> Code and interpretation</a></li>
</ul></li>
<li class="chapter" data-level="6.2" data-path="trees.html"><a href="trees.html#random-forests"><i class="fa fa-check"></i><b>6.2</b> Random forests</a><ul>
<li class="chapter" data-level="6.2.1" data-path="trees.html"><a href="trees.html#principle-1"><i class="fa fa-check"></i><b>6.2.1</b> Principle</a></li>
<li class="chapter" data-level="6.2.2" data-path="trees.html"><a href="trees.html#code-and-results-1"><i class="fa fa-check"></i><b>6.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="6.3" data-path="trees.html"><a href="trees.html#adaboost"><i class="fa fa-check"></i><b>6.3</b> Boosted trees: Adaboost</a><ul>
<li class="chapter" data-level="6.3.1" data-path="trees.html"><a href="trees.html#methodology"><i class="fa fa-check"></i><b>6.3.1</b> Methodology</a></li>
<li class="chapter" data-level="6.3.2" data-path="trees.html"><a href="trees.html#illustration"><i class="fa fa-check"></i><b>6.3.2</b> Illustration</a></li>
</ul></li>
<li class="chapter" data-level="6.4" data-path="trees.html"><a href="trees.html#boosted-trees-extreme-gradient-boosting"><i class="fa fa-check"></i><b>6.4</b> Boosted trees: extreme gradient boosting</a><ul>
<li class="chapter" data-level="6.4.1" data-path="trees.html"><a href="trees.html#managing-loss"><i class="fa fa-check"></i><b>6.4.1</b> Managing loss</a></li>
<li class="chapter" data-level="6.4.2" data-path="trees.html"><a href="trees.html#penalization"><i class="fa fa-check"></i><b>6.4.2</b> Penalization</a></li>
<li class="chapter" data-level="6.4.3" data-path="trees.html"><a href="trees.html#aggregation"><i class="fa fa-check"></i><b>6.4.3</b> Aggregation</a></li>
<li class="chapter" data-level="6.4.4" data-path="trees.html"><a href="trees.html#tree-structure"><i class="fa fa-check"></i><b>6.4.4</b> Tree structure</a></li>
<li class="chapter" data-level="6.4.5" data-path="trees.html"><a href="trees.html#boostext"><i class="fa fa-check"></i><b>6.4.5</b> Extensions</a></li>
<li class="chapter" data-level="6.4.6" data-path="trees.html"><a href="trees.html#boostcode"><i class="fa fa-check"></i><b>6.4.6</b> Code and results</a></li>
<li class="chapter" data-level="6.4.7" data-path="trees.html"><a href="trees.html#instweight"><i class="fa fa-check"></i><b>6.4.7</b> Instance weighting</a></li>
</ul></li>
<li class="chapter" data-level="6.5" data-path="trees.html"><a href="trees.html#discussion"><i class="fa fa-check"></i><b>6.5</b> Discussion</a></li>
<li class="chapter" data-level="6.6" data-path="trees.html"><a href="trees.html#coding-exercises-2"><i class="fa fa-check"></i><b>6.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="NN.html"><a href="NN.html"><i class="fa fa-check"></i><b>7</b> Neural networks</a><ul>
<li class="chapter" data-level="7.1" data-path="NN.html"><a href="NN.html#the-original-perceptron"><i class="fa fa-check"></i><b>7.1</b> The original perceptron</a></li>
<li class="chapter" data-level="7.2" data-path="NN.html"><a href="NN.html#multilayer-perceptron"><i class="fa fa-check"></i><b>7.2</b> Multilayer perceptron</a><ul>
<li class="chapter" data-level="7.2.1" data-path="NN.html"><a href="NN.html#introduction-and-notations"><i class="fa fa-check"></i><b>7.2.1</b> Introduction and notations</a></li>
<li class="chapter" data-level="7.2.2" data-path="NN.html"><a href="NN.html#universal-approximation"><i class="fa fa-check"></i><b>7.2.2</b> Universal approximation</a></li>
<li class="chapter" data-level="7.2.3" data-path="NN.html"><a href="NN.html#backprop"><i class="fa fa-check"></i><b>7.2.3</b> Learning via back-propagation</a></li>
<li class="chapter" data-level="7.2.4" data-path="NN.html"><a href="NN.html#further-details-on-classification"><i class="fa fa-check"></i><b>7.2.4</b> Further details on classification</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="NN.html"><a href="NN.html#howdeep"><i class="fa fa-check"></i><b>7.3</b> How deep we should go and other practical issues</a><ul>
<li class="chapter" data-level="7.3.1" data-path="NN.html"><a href="NN.html#architectural-choices"><i class="fa fa-check"></i><b>7.3.1</b> Architectural choices</a></li>
<li class="chapter" data-level="7.3.2" data-path="NN.html"><a href="NN.html#frequency-of-weight-updates-and-learning-duration"><i class="fa fa-check"></i><b>7.3.2</b> Frequency of weight updates and learning duration</a></li>
<li class="chapter" data-level="7.3.3" data-path="NN.html"><a href="NN.html#penalizations-and-dropout"><i class="fa fa-check"></i><b>7.3.3</b> Penalizations and dropout</a></li>
</ul></li>
<li class="chapter" data-level="7.4" data-path="NN.html"><a href="NN.html#code-samples-and-comments-for-vanilla-mlp"><i class="fa fa-check"></i><b>7.4</b> Code samples and comments for vanilla MLP</a><ul>
<li class="chapter" data-level="7.4.1" data-path="NN.html"><a href="NN.html#regression-example"><i class="fa fa-check"></i><b>7.4.1</b> Regression example</a></li>
<li class="chapter" data-level="7.4.2" data-path="NN.html"><a href="NN.html#classification-example"><i class="fa fa-check"></i><b>7.4.2</b> Classification example</a></li>
<li class="chapter" data-level="7.4.3" data-path="NN.html"><a href="NN.html#custloss"><i class="fa fa-check"></i><b>7.4.3</b> Custom losses</a></li>
</ul></li>
<li class="chapter" data-level="7.5" data-path="NN.html"><a href="NN.html#recurrent-networks"><i class="fa fa-check"></i><b>7.5</b> Recurrent networks</a><ul>
<li class="chapter" data-level="7.5.1" data-path="NN.html"><a href="NN.html#presentation"><i class="fa fa-check"></i><b>7.5.1</b> Presentation</a></li>
<li class="chapter" data-level="7.5.2" data-path="NN.html"><a href="NN.html#code-and-results-2"><i class="fa fa-check"></i><b>7.5.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="7.6" data-path="NN.html"><a href="NN.html#other-common-architectures"><i class="fa fa-check"></i><b>7.6</b> Other common architectures</a><ul>
<li class="chapter" data-level="7.6.1" data-path="NN.html"><a href="NN.html#generative-aversarial-networks"><i class="fa fa-check"></i><b>7.6.1</b> Generative adversarial networks</a></li>
<li class="chapter" data-level="7.6.2" data-path="NN.html"><a href="NN.html#autoencoders"><i class="fa fa-check"></i><b>7.6.2</b> Autoencoders</a></li>
<li class="chapter" data-level="7.6.3" data-path="NN.html"><a href="NN.html#a-word-on-convolutional-networks"><i class="fa fa-check"></i><b>7.6.3</b> A word on convolutional networks</a></li>
<li class="chapter" data-level="7.6.4" data-path="NN.html"><a href="NN.html#advanced-architectures"><i class="fa fa-check"></i><b>7.6.4</b> Advanced architectures</a></li>
</ul></li>
<li class="chapter" data-level="7.7" data-path="NN.html"><a href="NN.html#coding-exercise-1"><i class="fa fa-check"></i><b>7.7</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>8</b> Support vector machines</a><ul>
<li class="chapter" data-level="8.1" data-path="svm.html"><a href="svm.html#svm-for-classification"><i class="fa fa-check"></i><b>8.1</b> SVM for classification</a></li>
<li class="chapter" data-level="8.2" data-path="svm.html"><a href="svm.html#svm-for-regression"><i class="fa fa-check"></i><b>8.2</b> SVM for regression</a></li>
<li class="chapter" data-level="8.3" data-path="svm.html"><a href="svm.html#practice"><i class="fa fa-check"></i><b>8.3</b> Practice</a></li>
<li class="chapter" data-level="8.4" data-path="svm.html"><a href="svm.html#coding-exercises-3"><i class="fa fa-check"></i><b>8.4</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="bayes.html"><a href="bayes.html"><i class="fa fa-check"></i><b>9</b> Bayesian methods</a><ul>
<li class="chapter" data-level="9.1" data-path="bayes.html"><a href="bayes.html#the-bayesian-framework"><i class="fa fa-check"></i><b>9.1</b> The Bayesian framework</a></li>
<li class="chapter" data-level="9.2" data-path="bayes.html"><a href="bayes.html#bayesian-sampling"><i class="fa fa-check"></i><b>9.2</b> Bayesian sampling</a><ul>
<li class="chapter" data-level="9.2.1" data-path="bayes.html"><a href="bayes.html#gibbs-sampling"><i class="fa fa-check"></i><b>9.2.1</b> Gibbs sampling</a></li>
<li class="chapter" data-level="9.2.2" data-path="bayes.html"><a href="bayes.html#metropolis-hastings-sampling"><i class="fa fa-check"></i><b>9.2.2</b> Metropolis-Hastings sampling</a></li>
</ul></li>
<li class="chapter" data-level="9.3" data-path="bayes.html"><a href="bayes.html#bayesian-linear-regression"><i class="fa fa-check"></i><b>9.3</b> Bayesian linear regression</a></li>
<li class="chapter" data-level="9.4" data-path="bayes.html"><a href="bayes.html#naive-bayes-classifier"><i class="fa fa-check"></i><b>9.4</b> Naive Bayes classifier</a></li>
<li class="chapter" data-level="9.5" data-path="bayes.html"><a href="bayes.html#BART"><i class="fa fa-check"></i><b>9.5</b> Bayesian additive trees</a><ul>
<li class="chapter" data-level="9.5.1" data-path="bayes.html"><a href="bayes.html#general-formulation"><i class="fa fa-check"></i><b>9.5.1</b> General formulation</a></li>
<li class="chapter" data-level="9.5.2" data-path="bayes.html"><a href="bayes.html#priors"><i class="fa fa-check"></i><b>9.5.2</b> Priors</a></li>
<li class="chapter" data-level="9.5.3" data-path="bayes.html"><a href="bayes.html#sampling-and-predictions"><i class="fa fa-check"></i><b>9.5.3</b> Sampling and predictions</a></li>
<li class="chapter" data-level="9.5.4" data-path="bayes.html"><a href="bayes.html#code"><i class="fa fa-check"></i><b>9.5.4</b> Code</a></li>
</ul></li>
</ul></li>
<li class="part"><span><b>III From predictions to portfolios</b></span></li>
<li class="chapter" data-level="10" data-path="valtune.html"><a href="valtune.html"><i class="fa fa-check"></i><b>10</b> Validating and tuning</a><ul>
<li class="chapter" data-level="10.1" data-path="valtune.html"><a href="valtune.html#mlmetrics"><i class="fa fa-check"></i><b>10.1</b> Learning metrics</a><ul>
<li class="chapter" data-level="10.1.1" data-path="valtune.html"><a href="valtune.html#regression-analysis"><i class="fa fa-check"></i><b>10.1.1</b> Regression analysis</a></li>
<li class="chapter" data-level="10.1.2" data-path="valtune.html"><a href="valtune.html#classification-analysis"><i class="fa fa-check"></i><b>10.1.2</b> Classification analysis</a></li>
</ul></li>
<li class="chapter" data-level="10.2" data-path="valtune.html"><a href="valtune.html#validation"><i class="fa fa-check"></i><b>10.2</b> Validation</a><ul>
<li class="chapter" data-level="10.2.1" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-theory"><i class="fa fa-check"></i><b>10.2.1</b> The variance-bias tradeoff: theory</a></li>
<li class="chapter" data-level="10.2.2" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-illustration"><i class="fa fa-check"></i><b>10.2.2</b> The variance-bias tradeoff: illustration</a></li>
<li class="chapter" data-level="10.2.3" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-principle"><i class="fa fa-check"></i><b>10.2.3</b> The risk of overfitting: principle</a></li>
<li class="chapter" data-level="10.2.4" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-some-solutions"><i class="fa fa-check"></i><b>10.2.4</b> The risk of overfitting: some solutions</a></li>
</ul></li>
<li class="chapter" data-level="10.3" data-path="valtune.html"><a href="valtune.html#the-search-for-good-hyperparameters"><i class="fa fa-check"></i><b>10.3</b> The search for good hyperparameters</a><ul>
<li class="chapter" data-level="10.3.1" data-path="valtune.html"><a href="valtune.html#methods"><i class="fa fa-check"></i><b>10.3.1</b> Methods</a></li>
<li class="chapter" data-level="10.3.2" data-path="valtune.html"><a href="valtune.html#example-grid-search"><i class="fa fa-check"></i><b>10.3.2</b> Example: grid search</a></li>
<li class="chapter" data-level="10.3.3" data-path="valtune.html"><a href="valtune.html#example-bayesian-optimization"><i class="fa fa-check"></i><b>10.3.3</b> Example: Bayesian optimization</a></li>
</ul></li>
<li class="chapter" data-level="10.4" data-path="valtune.html"><a href="valtune.html#short-discussion-on-validation-in-backtests"><i class="fa fa-check"></i><b>10.4</b> Short discussion on validation in backtests</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="ensemble.html"><a href="ensemble.html"><i class="fa fa-check"></i><b>11</b> Ensemble models</a><ul>
<li class="chapter" data-level="11.1" data-path="ensemble.html"><a href="ensemble.html#linear-ensembles"><i class="fa fa-check"></i><b>11.1</b> Linear ensembles</a><ul>
<li class="chapter" data-level="11.1.1" data-path="ensemble.html"><a href="ensemble.html#principles"><i class="fa fa-check"></i><b>11.1.1</b> Principles</a></li>
<li class="chapter" data-level="11.1.2" data-path="ensemble.html"><a href="ensemble.html#example"><i class="fa fa-check"></i><b>11.1.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="11.2" data-path="ensemble.html"><a href="ensemble.html#stacked-ensembles"><i class="fa fa-check"></i><b>11.2</b> Stacked ensembles</a><ul>
<li class="chapter" data-level="11.2.1" data-path="ensemble.html"><a href="ensemble.html#two-stage-training"><i class="fa fa-check"></i><b>11.2.1</b> Two-stage training</a></li>
<li class="chapter" data-level="11.2.2" data-path="ensemble.html"><a href="ensemble.html#code-and-results-3"><i class="fa fa-check"></i><b>11.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="11.3" data-path="ensemble.html"><a href="ensemble.html#extensions-1"><i class="fa fa-check"></i><b>11.3</b> Extensions</a><ul>
<li class="chapter" data-level="11.3.1" data-path="ensemble.html"><a href="ensemble.html#exogenous-variables"><i class="fa fa-check"></i><b>11.3.1</b> Exogenous variables</a></li>
<li class="chapter" data-level="11.3.2" data-path="ensemble.html"><a href="ensemble.html#shrinking-inter-model-correlations"><i class="fa fa-check"></i><b>11.3.2</b> Shrinking inter-model correlations</a></li>
</ul></li>
<li class="chapter" data-level="11.4" data-path="ensemble.html"><a href="ensemble.html#exercise"><i class="fa fa-check"></i><b>11.4</b> Exercise</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="backtest.html"><a href="backtest.html"><i class="fa fa-check"></i><b>12</b> Portfolio backtesting</a><ul>
<li class="chapter" data-level="12.1" data-path="backtest.html"><a href="backtest.html#protocol"><i class="fa fa-check"></i><b>12.1</b> Setting the protocol</a></li>
<li class="chapter" data-level="12.2" data-path="backtest.html"><a href="backtest.html#turning-signals-into-portfolio-weights"><i class="fa fa-check"></i><b>12.2</b> Turning signals into portfolio weights</a></li>
<li class="chapter" data-level="12.3" data-path="backtest.html"><a href="backtest.html#perfmet"><i class="fa fa-check"></i><b>12.3</b> Performance metrics</a><ul>
<li class="chapter" data-level="12.3.1" data-path="backtest.html"><a href="backtest.html#discussion-1"><i class="fa fa-check"></i><b>12.3.1</b> Discussion</a></li>
<li class="chapter" data-level="12.3.2" data-path="backtest.html"><a href="backtest.html#pure-performance-and-risk-indicators"><i class="fa fa-check"></i><b>12.3.2</b> Pure performance and risk indicators</a></li>
<li class="chapter" data-level="12.3.3" data-path="backtest.html"><a href="backtest.html#factor-based-evaluation"><i class="fa fa-check"></i><b>12.3.3</b> Factor-based evaluation</a></li>
<li class="chapter" data-level="12.3.4" data-path="backtest.html"><a href="backtest.html#risk-adjusted-measures"><i class="fa fa-check"></i><b>12.3.4</b> Risk-adjusted measures</a></li>
<li class="chapter" data-level="12.3.5" data-path="backtest.html"><a href="backtest.html#transaction-costs-and-turnover"><i class="fa fa-check"></i><b>12.3.5</b> Transaction costs and turnover</a></li>
</ul></li>
<li class="chapter" data-level="12.4" data-path="backtest.html"><a href="backtest.html#common-errors-and-issues"><i class="fa fa-check"></i><b>12.4</b> Common errors and issues</a><ul>
<li class="chapter" data-level="12.4.1" data-path="backtest.html"><a href="backtest.html#forward-looking-data"><i class="fa fa-check"></i><b>12.4.1</b> Forward looking data</a></li>
<li class="chapter" data-level="12.4.2" data-path="backtest.html"><a href="backtest.html#backov"><i class="fa fa-check"></i><b>12.4.2</b> Backtest overfitting</a></li>
<li class="chapter" data-level="12.4.3" data-path="backtest.html"><a href="backtest.html#simple-safeguards"><i class="fa fa-check"></i><b>12.4.3</b> Simple safeguards</a></li>
</ul></li>
<li class="chapter" data-level="12.5" data-path="backtest.html"><a href="backtest.html#implication-of-non-stationarity-forecasting-is-hard"><i class="fa fa-check"></i><b>12.5</b> Implication of non-stationarity: forecasting is hard</a><ul>
<li class="chapter" data-level="12.5.1" data-path="backtest.html"><a href="backtest.html#general-comments"><i class="fa fa-check"></i><b>12.5.1</b> General comments</a></li>
<li class="chapter" data-level="12.5.2" data-path="backtest.html"><a href="backtest.html#the-no-free-lunch-theorem"><i class="fa fa-check"></i><b>12.5.2</b> The no free lunch theorem</a></li>
</ul></li>
<li class="chapter" data-level="12.6" data-path="backtest.html"><a href="backtest.html#first-example-a-complete-backtest"><i class="fa fa-check"></i><b>12.6</b> First example: a complete backtest</a></li>
<li class="chapter" data-level="12.7" data-path="backtest.html"><a href="backtest.html#second-example-backtest-overfitting"><i class="fa fa-check"></i><b>12.7</b> Second example: backtest overfitting</a></li>
<li class="chapter" data-level="12.8" data-path="backtest.html"><a href="backtest.html#coding-exercises-4"><i class="fa fa-check"></i><b>12.8</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>IV Further important topics</b></span></li>
<li class="chapter" data-level="13" data-path="interp.html"><a href="interp.html"><i class="fa fa-check"></i><b>13</b> Interpretability</a><ul>
<li class="chapter" data-level="13.1" data-path="interp.html"><a href="interp.html#global-interpretations"><i class="fa fa-check"></i><b>13.1</b> Global interpretations</a><ul>
<li class="chapter" data-level="13.1.1" data-path="interp.html"><a href="interp.html#surr"><i class="fa fa-check"></i><b>13.1.1</b> Simple models as surrogates</a></li>
<li class="chapter" data-level="13.1.2" data-path="interp.html"><a href="interp.html#variable-importance"><i class="fa fa-check"></i><b>13.1.2</b> Variable importance (tree-based)</a></li>
<li class="chapter" data-level="13.1.3" data-path="interp.html"><a href="interp.html#variable-importance-agnostic"><i class="fa fa-check"></i><b>13.1.3</b> Variable importance (agnostic)</a></li>
<li class="chapter" data-level="13.1.4" data-path="interp.html"><a href="interp.html#partial-dependence-plot"><i class="fa fa-check"></i><b>13.1.4</b> Partial dependence plot</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="interp.html"><a href="interp.html#local-interpretations"><i class="fa fa-check"></i><b>13.2</b> Local interpretations</a><ul>
<li class="chapter" data-level="13.2.1" data-path="interp.html"><a href="interp.html#lime"><i class="fa fa-check"></i><b>13.2.1</b> LIME</a></li>
<li class="chapter" data-level="13.2.2" data-path="interp.html"><a href="interp.html#shapley-values"><i class="fa fa-check"></i><b>13.2.2</b> Shapley values</a></li>
<li class="chapter" data-level="13.2.3" data-path="interp.html"><a href="interp.html#breakdown"><i class="fa fa-check"></i><b>13.2.3</b> Breakdown</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="14" data-path="causality.html"><a href="causality.html"><i class="fa fa-check"></i><b>14</b> Two key concepts: causality and non-stationarity</a><ul>
<li class="chapter" data-level="14.1" data-path="causality.html"><a href="causality.html#causality-1"><i class="fa fa-check"></i><b>14.1</b> Causality</a><ul>
<li class="chapter" data-level="14.1.1" data-path="causality.html"><a href="causality.html#granger"><i class="fa fa-check"></i><b>14.1.1</b> Granger causality</a></li>
<li class="chapter" data-level="14.1.2" data-path="causality.html"><a href="causality.html#causal-additive-models"><i class="fa fa-check"></i><b>14.1.2</b> Causal additive models</a></li>
<li class="chapter" data-level="14.1.3" data-path="causality.html"><a href="causality.html#structural-time-series-models"><i class="fa fa-check"></i><b>14.1.3</b> Structural time series models</a></li>
</ul></li>
<li class="chapter" data-level="14.2" data-path="causality.html"><a href="causality.html#nonstat"><i class="fa fa-check"></i><b>14.2</b> Dealing with changing environments</a><ul>
<li class="chapter" data-level="14.2.1" data-path="causality.html"><a href="causality.html#non-stationarity-yet-another-illustration"><i class="fa fa-check"></i><b>14.2.1</b> Non-stationarity: yet another illustration</a></li>
<li class="chapter" data-level="14.2.2" data-path="causality.html"><a href="causality.html#online-learning"><i class="fa fa-check"></i><b>14.2.2</b> Online learning</a></li>
<li class="chapter" data-level="14.2.3" data-path="causality.html"><a href="causality.html#homogeneous-transfer-learning"><i class="fa fa-check"></i><b>14.2.3</b> Homogeneous transfer learning</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="15" data-path="unsup.html"><a href="unsup.html"><i class="fa fa-check"></i><b>15</b> Unsupervised learning</a><ul>
<li class="chapter" data-level="15.1" data-path="unsup.html"><a href="unsup.html#corpred"><i class="fa fa-check"></i><b>15.1</b> The problem with correlated predictors</a></li>
<li class="chapter" data-level="15.2" data-path="unsup.html"><a href="unsup.html#principal-component-analysis-and-autoencoders"><i class="fa fa-check"></i><b>15.2</b> Principal component analysis and autoencoders</a><ul>
<li class="chapter" data-level="15.2.1" data-path="unsup.html"><a href="unsup.html#a-bit-of-algebra"><i class="fa fa-check"></i><b>15.2.1</b> A bit of algebra</a></li>
<li class="chapter" data-level="15.2.2" data-path="unsup.html"><a href="unsup.html#pca"><i class="fa fa-check"></i><b>15.2.2</b> PCA</a></li>
<li class="chapter" data-level="15.2.3" data-path="unsup.html"><a href="unsup.html#ae"><i class="fa fa-check"></i><b>15.2.3</b> Autoencoders</a></li>
<li class="chapter" data-level="15.2.4" data-path="unsup.html"><a href="unsup.html#application"><i class="fa fa-check"></i><b>15.2.4</b> Application</a></li>
</ul></li>
<li class="chapter" data-level="15.3" data-path="unsup.html"><a href="unsup.html#clustering-via-k-means"><i class="fa fa-check"></i><b>15.3</b> Clustering via k-means</a></li>
<li class="chapter" data-level="15.4" data-path="unsup.html"><a href="unsup.html#nearest-neighbors"><i class="fa fa-check"></i><b>15.4</b> Nearest neighbors</a></li>
<li class="chapter" data-level="15.5" data-path="unsup.html"><a href="unsup.html#coding-exercise-2"><i class="fa fa-check"></i><b>15.5</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="16" data-path="RL.html"><a href="RL.html"><i class="fa fa-check"></i><b>16</b> Reinforcement learning</a><ul>
<li class="chapter" data-level="16.1" data-path="RL.html"><a href="RL.html#theoretical-layout"><i class="fa fa-check"></i><b>16.1</b> Theoretical layout</a><ul>
<li class="chapter" data-level="16.1.1" data-path="RL.html"><a href="RL.html#general-framework"><i class="fa fa-check"></i><b>16.1.1</b> General framework</a></li>
<li class="chapter" data-level="16.1.2" data-path="RL.html"><a href="RL.html#q-learning"><i class="fa fa-check"></i><b>16.1.2</b> Q-learning</a></li>
<li class="chapter" data-level="16.1.3" data-path="RL.html"><a href="RL.html#sarsa"><i class="fa fa-check"></i><b>16.1.3</b> SARSA</a></li>
</ul></li>
<li class="chapter" data-level="16.2" data-path="RL.html"><a href="RL.html#the-curse-of-dimensionality"><i class="fa fa-check"></i><b>16.2</b> The curse of dimensionality</a></li>
<li class="chapter" data-level="16.3" data-path="RL.html"><a href="RL.html#policy-gradient"><i class="fa fa-check"></i><b>16.3</b> Policy gradient</a><ul>
<li class="chapter" data-level="16.3.1" data-path="RL.html"><a href="RL.html#principle-2"><i class="fa fa-check"></i><b>16.3.1</b> Principle</a></li>
<li class="chapter" data-level="16.3.2" data-path="RL.html"><a href="RL.html#extensions-2"><i class="fa fa-check"></i><b>16.3.2</b> Extensions</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="RL.html"><a href="RL.html#simple-examples"><i class="fa fa-check"></i><b>16.4</b> Simple examples</a><ul>
<li class="chapter" data-level="16.4.1" data-path="RL.html"><a href="RL.html#q-learning-with-simulations"><i class="fa fa-check"></i><b>16.4.1</b> Q-learning with simulations</a></li>
<li class="chapter" data-level="16.4.2" data-path="RL.html"><a href="RL.html#RLemp2"><i class="fa fa-check"></i><b>16.4.2</b> Q-learning with market data</a></li>
</ul></li>
<li class="chapter" data-level="16.5" data-path="RL.html"><a href="RL.html#concluding-remarks"><i class="fa fa-check"></i><b>16.5</b> Concluding remarks</a></li>
<li class="chapter" data-level="16.6" data-path="RL.html"><a href="RL.html#exercises"><i class="fa fa-check"></i><b>16.6</b> Exercises</a></li>
</ul></li>
<li class="part"><span><b>V Appendix</b></span></li>
<li class="chapter" data-level="17" data-path="data-description.html"><a href="data-description.html"><i class="fa fa-check"></i><b>17</b> Data description</a></li>
<li class="chapter" data-level="18" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html"><i class="fa fa-check"></i><b>18</b> Solutions to exercises</a><ul>
<li class="chapter" data-level="18.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-3"><i class="fa fa-check"></i><b>18.1</b> Chapter 3</a></li>
<li class="chapter" data-level="18.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-4"><i class="fa fa-check"></i><b>18.2</b> Chapter 4</a></li>
<li class="chapter" data-level="18.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-5"><i class="fa fa-check"></i><b>18.3</b> Chapter 5</a></li>
<li class="chapter" data-level="18.4" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-6"><i class="fa fa-check"></i><b>18.4</b> Chapter 6</a></li>
<li class="chapter" data-level="18.5" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-7-the-autoencoder-model"><i class="fa fa-check"></i><b>18.5</b> Chapter 7: the autoencoder model</a></li>
<li class="chapter" data-level="18.6" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-8"><i class="fa fa-check"></i><b>18.6</b> Chapter 8</a></li>
<li class="chapter" data-level="18.7" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-11-ensemble-neural-network"><i class="fa fa-check"></i><b>18.7</b> Chapter 11: ensemble neural network</a></li>
<li class="chapter" data-level="18.8" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-12"><i class="fa fa-check"></i><b>18.8</b> Chapter 12</a><ul>
<li class="chapter" data-level="18.8.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#ew-portfolios-with-the-tidyverse"><i class="fa fa-check"></i><b>18.8.1</b> EW portfolios with the tidyverse</a></li>
<li class="chapter" data-level="18.8.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#advanced-weighting-function"><i class="fa fa-check"></i><b>18.8.2</b> Advanced weighting function</a></li>
<li class="chapter" data-level="18.8.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#functional-programming-in-the-backtest"><i class="fa fa-check"></i><b>18.8.3</b> Functional programming in the backtest</a></li>
</ul></li>
<li class="chapter" data-level="18.9" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-15"><i class="fa fa-check"></i><b>18.9</b> Chapter 15</a></li>
<li class="chapter" data-level="18.10" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-16"><i class="fa fa-check"></i><b>18.10</b> Chapter 16</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning for Factor Investing</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="unsup" class="section level1">
<h1><span class="header-section-number">Chapter 15</span> Unsupervised learning</h1>
<p>All algorithms presented in Chapters <a href="lasso.html#lasso">5</a> to <a href="bayes.html#bayes">9</a> belong to the larger class of supervised learning tools. Such tools seek to unveil a mapping between predictors <span class="math inline">\(\textbf{X}\)</span> and a label <span class="math inline">\(\textbf{Z}\)</span>. The supervision comes from the fact that the algorithm is explicitly asked to explain this particular variable <span class="math inline">\(\textbf{Z}\)</span> with the data. Another important part of machine learning consists of unsupervised tasks, that is, when <span class="math inline">\(\textbf{Z}\)</span> is not specified and the algorithm tries to make sense of <span class="math inline">\(\textbf{X}\)</span> on its own. Often, relationships between the components of <span class="math inline">\(\textbf{X}\)</span> are identified. This field is much too vast to be summarized in one book, let alone one chapter. The purpose here is to briefly explain in what ways unsupervised learning can be used, especially in the data pre-processing phase.</p>
<div id="corpred" class="section level2">
<h2><span class="header-section-number">15.1</span> The problem with correlated predictors</h2>
<p>Often, it is tempting to supply all predictors to an ML-fueled predictive engine. That may not be a good idea when some predictors are highly correlated. To illustrate this, the simplest example is a regression on two variables with zero mean and the following covariance and precision matrices:
<span class="math display">\[\boldsymbol{\Sigma}=\textbf{X}'\textbf{X}=\begin{bmatrix} 1 & \rho \\ \rho & 1 \end{bmatrix}, \quad \boldsymbol{\Sigma}^{-1}=\frac{1}{1-\rho^2}\begin{bmatrix} 1 & -\rho \\ -\rho & 1 \end{bmatrix}.\]</span>
When the covariance/correlation <span class="math inline">\(\rho\)</span> increases towards 1 (the two variables become collinear), the scaling denominator <span class="math inline">\(1-\rho^2\)</span> in <span class="math inline">\(\boldsymbol{\Sigma}^{-1}\)</span> goes to zero and the formula <span class="math inline">\(\hat{\boldsymbol{\beta}}=\boldsymbol{\Sigma}^{-1}\textbf{X}'\textbf{Z}\)</span> implies that one coefficient will be highly positive and the other highly negative. The regression creates a spurious arbitrage between the two variables. Of course, this is very inefficient and yields disastrous results out-of-sample.</p>
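<p>To make this concrete, below is a minimal simulation (a sketch, not part of the book's code) in which two almost perfectly correlated predictors are fed to a linear regression; the variable names are purely illustrative. The estimated coefficients are typically inflated and of opposite signs, even though only their common signal matters for the label.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Illustrative sketch: spurious offsetting coefficients under near-collinearity
set.seed(42)                          # For reproducibility
n  &lt;- 500
x1 &lt;- rnorm(n)                        # First predictor
x2 &lt;- x1 + rnorm(n, sd = 0.05)        # Second predictor, correlation with x1 close to 1
z  &lt;- 0.5 * x1 + rnorm(n)             # Label driven by the common signal only
cor(x1, x2)                           # Check the (very high) correlation
coef(lm(z ~ x1 + x2))                 # Estimates are unstable: often large and of opposite signs</code></pre></div>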
<p>We illustrate what happens when many variables are used in the regression below (Table <a href="unsup.html#tab:regbroom">15.1</a>). One clear illustration of this phenomenon comes from the variables Mkt_Cap_12M_Usd and Mkt_Cap_6M_Usd, which have a correlation of 99.6% in the training sample. Both are singled out as highly significant, but their signs are contradictory. Moreover, the magnitudes of their coefficients are very close (0.20 versus 0.18), so that their net effect approximately cancels out. Naturally, providing the regression with only one of these two inputs would have been wiser.</p>
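<p>The 99.6% figure can be verified directly (a one-line sketch, assuming the same training_sample data frame used in the regression code below):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">cor(training_sample$Mkt_Cap_12M_Usd,   # Correlation between the two market capitalization features
    training_sample$Mkt_Cap_6M_Usd)    # Should be close to 0.996 in the training sample</code></pre></div>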
<div class="sourceCode" id="cb224"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb224-1"><a href="unsup.html#cb224-1"></a><span class="kw">library</span>(broom) <span class="co"># Package for clean regression output </span></span>
<span id="cb224-2"><a href="unsup.html#cb224-2"></a>training_sample <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb224-3"><a href="unsup.html#cb224-3"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(<span class="kw">c</span>(features, <span class="st">"R1M_Usd"</span>)) <span class="op">%>%</span><span class="st"> </span><span class="co"># List of variables</span></span>
<span id="cb224-4"><a href="unsup.html#cb224-4"></a><span class="st"> </span><span class="kw">lm</span>(R1M_Usd <span class="op">~</span><span class="st"> </span>. , <span class="dt">data =</span> .) <span class="op">%>%</span><span class="st"> </span><span class="co"># Model: predict R1M_Usd</span></span>
<span id="cb224-5"><a href="unsup.html#cb224-5"></a><span class="st"> </span><span class="kw">tidy</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Put output in clean format</span></span>
<span id="cb224-6"><a href="unsup.html#cb224-6"></a><span class="st"> </span><span class="kw">filter</span>(<span class="kw">abs</span>(statistic) <span class="op">></span><span class="st"> </span><span class="dv">3</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># Keep significant predictors only</span></span>
<span id="cb224-7"><a href="unsup.html#cb224-7"></a><span class="st"> </span>knitr<span class="op">::</span><span class="kw">kable</span>(<span class="dt">booktabs =</span> <span class="ot">TRUE</span>,</span>
<span id="cb224-8"><a href="unsup.html#cb224-8"></a> <span class="dt">caption =</span> <span class="st">"Significant predictors in the training sample."</span>) </span></code></pre></div>
<table>
<caption>
<span id="tab:regbroom">TABLE 15.1: </span>Significant predictors in the training sample.
</caption>
<thead>
<tr>
<th style="text-align:left;">
term
</th>
<th style="text-align:right;">
estimate
</th>
<th style="text-align:right;">
std.error
</th>
<th style="text-align:right;">
statistic
</th>
<th style="text-align:right;">
p.value
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
(Intercept)
</td>
<td style="text-align:right;">
0.0405741
</td>
<td style="text-align:right;">
0.0053427
</td>
<td style="text-align:right;">
7.594323
</td>
<td style="text-align:right;">
0.0000000
</td>
</tr>
<tr>
<td style="text-align:left;">
Ebitda_Margin
</td>
<td style="text-align:right;">
0.0132374
</td>
<td style="text-align:right;">
0.0034927
</td>
<td style="text-align:right;">
3.789999
</td>
<td style="text-align:right;">
0.0001507
</td>
</tr>
<tr>
<td style="text-align:left;">
Ev_Ebitda
</td>
<td style="text-align:right;">
0.0068144
</td>
<td style="text-align:right;">
0.0022563
</td>
<td style="text-align:right;">
3.020213
</td>
<td style="text-align:right;">
0.0025263
</td>
</tr>
<tr>
<td style="text-align:left;">
Fa_Ci
</td>
<td style="text-align:right;">
0.0072308
</td>
<td style="text-align:right;">
0.0023465
</td>
<td style="text-align:right;">
3.081471
</td>
<td style="text-align:right;">
0.0020601
</td>
</tr>
<tr>
<td style="text-align:left;">
Fcf_Bv
</td>
<td style="text-align:right;">
0.0250538
</td>
<td style="text-align:right;">
0.0051314
</td>
<td style="text-align:right;">
4.882465
</td>
<td style="text-align:right;">
0.0000010
</td>
</tr>
<tr>
<td style="text-align:left;">
Fcf_Yld
</td>
<td style="text-align:right;">
-0.0158930
</td>
<td style="text-align:right;">
0.0037359
</td>
<td style="text-align:right;">
-4.254126
</td>
<td style="text-align:right;">
0.0000210
</td>
</tr>
<tr>
<td style="text-align:left;">
Mkt_Cap_12M_Usd
</td>
<td style="text-align:right;">
0.2047383
</td>
<td style="text-align:right;">
0.0274320
</td>
<td style="text-align:right;">
7.463476
</td>
<td style="text-align:right;">
0.0000000
</td>
</tr>
<tr>
<td style="text-align:left;">
Mkt_Cap_6M_Usd
</td>
<td style="text-align:right;">
-0.1797795
</td>
<td style="text-align:right;">
0.0459390
</td>
<td style="text-align:right;">
-3.913443
</td>
<td style="text-align:right;">
0.0000910
</td>
</tr>
<tr>
<td style="text-align:left;">
Mom_5M_Usd
</td>
<td style="text-align:right;">
-0.0186690
</td>
<td style="text-align:right;">
0.0044313
</td>
<td style="text-align:right;">
-4.212972
</td>
<td style="text-align:right;">
0.0000252
</td>
</tr>
<tr>
<td style="text-align:left;">
Mom_Sharp_11M_Usd
</td>
<td style="text-align:right;">
0.0178174
</td>
<td style="text-align:right;">
0.0046948
</td>
<td style="text-align:right;">
3.795131
</td>
<td style="text-align:right;">
0.0001476
</td>
</tr>
<tr>
<td style="text-align:left;">
Ni
</td>
<td style="text-align:right;">
0.0154609
</td>
<td style="text-align:right;">
0.0044966
</td>
<td style="text-align:right;">
3.438361
</td>
<td style="text-align:right;">
0.0005854
</td>
</tr>
<tr>
<td style="text-align:left;">
Ni_Avail_Margin
</td>
<td style="text-align:right;">
0.0118135
</td>
<td style="text-align:right;">
0.0038614
</td>
<td style="text-align:right;">
3.059359
</td>
<td style="text-align:right;">
0.0022184
</td>
</tr>
<tr>
<td style="text-align:left;">
Ocf_Bv
</td>
<td style="text-align:right;">
-0.0198113
</td>
<td style="text-align:right;">
0.0052939
</td>
<td style="text-align:right;">
-3.742277
</td>
<td style="text-align:right;">
0.0001824
</td>
</tr>
<tr>
<td style="text-align:left;">
Pb
</td>
<td style="text-align:right;">
-0.0178971
</td>
<td style="text-align:right;">
0.0031285
</td>
<td style="text-align:right;">
-5.720637
</td>
<td style="text-align:right;">
0.0000000
</td>
</tr>
<tr>
<td style="text-align:left;">
Pe
</td>
<td style="text-align:right;">
-0.0089908
</td>
<td style="text-align:right;">
0.0023539
</td>
<td style="text-align:right;">
-3.819565
</td>
<td style="text-align:right;">
0.0001337
</td>
</tr>
<tr>
<td style="text-align:left;">
Sales_Ps
</td>
<td style="text-align:right;">
-0.0157856
</td>
<td style="text-align:right;">
0.0046278
</td>
<td style="text-align:right;">
-3.411062
</td>
<td style="text-align:right;">
0.0006472
</td>
</tr>
<tr>
<td style="text-align:left;">
Vol1Y_Usd
</td>
<td style="text-align:right;">
0.0114250
</td>
<td style="text-align:right;">
0.0027923
</td>
<td style="text-align:right;">
4.091628
</td>
<td style="text-align:right;">
0.0000429
</td>
</tr>
<tr>
<td style="text-align:left;">
Vol3Y_Usd
</td>
<td style="text-align:right;">
0.0084587
</td>
<td style="text-align:right;">
0.0027952
</td>
<td style="text-align:right;">
3.026169
</td>
<td style="text-align:right;">
0.0024771
</td>
</tr>
</tbody>
</table>
<p>In fact, there are several indicators of market capitalization and perhaps only one of them would suffice, but it is not obvious which one is the best choice.</p>
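<p>As a quick check, the pairwise correlations between the capitalization-based predictors can be computed directly. The snippet below is a minimal sketch; it assumes that all of these columns share the "Mkt_Cap" prefix in the training sample.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">training_sample %>%                               # Training data
    dplyr::select(dplyr::contains("Mkt_Cap")) %>% # Capitalization-based columns (assumed prefix)
    cor() %>%                                     # Pairwise correlation matrix
    round(3)                                      # Rounded for readability</code></pre></div>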
<p>To further illustrate these correlation issues, we compute the correlation matrix of the predictors (on the training sample) below. Because of its dimension, we show it graphically; since there are far too many labels to remain legible, we remove them.</p>
<div class="sourceCode" id="cb225"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb225-1"><a href="unsup.html#cb225-1"></a><span class="kw">library</span>(corrplot) <span class="co"># Package for plots of correlation matrices</span></span>
<span id="cb225-2"><a href="unsup.html#cb225-2"></a>C <-<span class="st"> </span><span class="kw">cor</span>(training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features)) <span class="co"># Correlation matrix</span></span>
<span id="cb225-3"><a href="unsup.html#cb225-3"></a><span class="kw">corrplot</span>(C, <span class="dt">tl.pos=</span><span class="st">'n'</span>) <span class="co"># Plot</span></span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:instcorrplot2"></span>
<img src="ML_factor_files/figure-html/instcorrplot2-1.png" alt="Correlation matrix of predictors." width="480" />
<p class="caption">
FIGURE 15.1: Correlation matrix of predictors.
</p>
</div>
<p>The graph of Figure <a href="unsup.html#fig:instcorrplot2">15.1</a> reveals several blue squares around the diagonal. For instance, the biggest square around the first third of features relates to all accounting ratios based on free cash flows. Because of this common term in their calculation, the features are naturally highly correlated. These local correlation patterns occur several times in the dataset and explain why it is not a good idea to use simple regression with this set of features.</p>
<p>In full disclosure, <strong>multicollinearity</strong> (when predictors are correlated) can be much less of a problem for ML tools than it is for pure statistical inference. In statistics, one central goal is to study the properties of the <span class="math inline">\(\beta\)</span> coefficients, and collinearity perturbs this kind of analysis. In machine learning, the aim is to maximize out-of-sample accuracy: if having many predictors helps, then so be it. One simple example can clarify this matter. When building a regression tree, having many predictors gives more options for the splits; if the features make sense, they can be useful. The same reasoning applies to random forests and boosted trees. What matters is that the large spectrum of features improves the generalization ability of the model; their collinearity is, for predictive purposes, largely irrelevant.</p>
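<p>One way to make this point tangible is to compare the fitted values of the full regression with those of a regression that drops one of the two almost collinear capitalization variables: the individual coefficients change drastically, but the predictions barely move. The sketch below is purely illustrative and assumes the training_sample and features objects used above.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">fit_full <- lm(R1M_Usd ~ .,                       # Regression with all features
               data = training_sample %>% dplyr::select(c(features, "R1M_Usd")))
fit_red  <- lm(R1M_Usd ~ .,                       # Same model without one of the two "twins"
               data = training_sample %>% 
                   dplyr::select(c(setdiff(features, "Mkt_Cap_6M_Usd"), "R1M_Usd")))
cor(fitted(fit_full), fitted(fit_red))            # Fitted values are expected to be almost identical</code></pre></div>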
<p>In the remainder of the chapter, we present two approaches that help reduce the number of predictors:</p>
<ul>
<li>the first one aims at creating new variables that are uncorrelated with each other. Low correlation is favorable from an algorithmic point of view, but the new variables lack interpretability;<br />
</li>
<li>the second one gathers predictors into homogeneous clusters and keeps only one feature from each cluster. Here the rationale is reversed: interpretability is favored over statistical properties, because the resulting set of features may still include high correlations, albeit to a lesser extent than the original one.</li>
</ul>
</div>
<div id="principal-component-analysis-and-autoencoders" class="section level2">
<h2><span class="header-section-number">15.2</span> Principal component analysis and autoencoders</h2>
<p>The first method is a cornerstone in dimensionality reduction. It seeks to determine a smaller number of factors (<span class="math inline">\(K'<K\)</span>) such that:<br />
- i) the level of explanatory power remains as high as possible;<br />
- ii) the resulting factors are linear combinations of the original variables;<br />
- iii) the resulting factors are orthogonal.</p>
<div id="a-bit-of-algebra" class="section level3">
<h3><span class="header-section-number">15.2.1</span> A bit of algebra</h3>
<p>
In this short subsection, we define some key concepts that are required to fully understand the derivation of principal component analysis (PCA). Henceforth, we work with matrices (in bold fonts). An <span class="math inline">\(I \times K\)</span> matrix <span class="math inline">\(\textbf{X}\)</span> is orthonormal if <span class="math inline">\(I> K\)</span> and <span class="math inline">\(\textbf{X}'\textbf{X}=\textbf{I}_K\)</span>. When <span class="math inline">\(I=K\)</span>, the (square) matrix is called orthogonal and <span class="math inline">\(\textbf{X}'\textbf{X}=\textbf{X}\textbf{X}'=\textbf{I}_K\)</span>, i.e., <span class="math inline">\(\textbf{X}^{-1}=\textbf{X}'\)</span>.</p>
<p>One foundational result in matrix theory is the Singular Value Decomposition (SVD, see, e.g., chapter 5 in <span class="citation">Meyer (<a href="#ref-meyer2000matrix" role="doc-biblioref">2000</a>)</span>). The SVD is formulated as follows: any <span class="math inline">\(I \times K\)</span> matrix <span class="math inline">\(\textbf{X}\)</span> can be decomposed into
<span class="math display" id="eq:svd">\[\begin{equation}
\tag{15.1}
\textbf{X}=\textbf{U} \boldsymbol{\Delta} \textbf{V}',
\end{equation}\]</span>
where <span class="math inline">\(\textbf{U}\)</span> (<span class="math inline">\(I\times I\)</span>) and <span class="math inline">\(\textbf{V}\)</span> (<span class="math inline">\(K \times K\)</span>) are orthogonal and <span class="math inline">\(\boldsymbol{\Delta}\)</span> (with dimensions <span class="math inline">\(I\times K\)</span>) is diagonal, i.e., <span class="math inline">\(\Delta_{i,k}=0\)</span> whenever <span class="math inline">\(i\neq k\)</span>. In addition, <span class="math inline">\(\Delta_{i,i}\ge 0\)</span>: the diagonal terms of <span class="math inline">\(\boldsymbol{\Delta}\)</span> are nonnegative.</p>
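<p>The decomposition is readily available in R through the svd() function. The short sketch below, on a small random matrix, simply verifies Equation <a href="unsup.html#eq:svd">(15.1)</a>; note that R returns the thin version of the SVD, in which <span class="math inline">\(\textbf{U}\)</span> has orthonormal columns rather than being a full square matrix.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">set.seed(42)                                        # Reproducibility
X <- matrix(rnorm(60), nrow = 10)                   # A small 10 x 6 matrix
dec <- svd(X)                                       # Singular value decomposition
max(abs(X - dec$u %*% diag(dec$d) %*% t(dec$v)))    # Reconstruction error: numerically zero
max(abs(t(dec$u) %*% dec$u - diag(6)))              # U'U = I_6 (orthonormal columns)</code></pre></div>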
<p>For simplicity, we assume below that <span class="math inline">\(\textbf{1}_I'\textbf{X}=\textbf{0}_K'\)</span>, i.e., that all columns have zero sum (and hence zero mean).<a href="#fn32" class="footnote-ref" id="fnref32"><sup>32</sup></a> This allows us to write the sample covariance matrix simply as <span class="math inline">\(\boldsymbol{\Sigma}_X= \frac{1}{I-1}\textbf{X}'\textbf{X}\)</span>.</p>
<p>One crucial feature of covariance matrices is their symmetry. Indeed, real-valued symmetric (square) matrices enjoy an SVD which is much more powerful: when <span class="math inline">\(\textbf{X}\)</span> is symmetric, there exist an orthogonal matrix <span class="math inline">\(\textbf{Q}\)</span> and a diagonal matrix <span class="math inline">\(\textbf{D}\)</span> such that
<span class="math display" id="eq:diagonaliz">\[\begin{equation}
\tag{15.2}
\textbf{X}=\textbf{Q}\textbf{DQ}'.
\end{equation}\]</span>
This process is called <strong>diagonalization</strong> (see chapter 7 in <span class="citation">Meyer (<a href="#ref-meyer2000matrix" role="doc-biblioref">2000</a>)</span>) and conveniently applies to covariance matrices.</p>
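<p>In R, this diagonalization is obtained with the eigen() function. The minimal sketch below, applied to a sample covariance matrix, checks that the eigenvectors form an orthogonal matrix and that <span class="math inline">\(\textbf{Q}\textbf{DQ}'\)</span> recovers the original matrix.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">set.seed(42)                                                   # Reproducibility
X <- matrix(rnorm(200), nrow = 50)                             # A 50 x 4 sample
S <- cov(X)                                                    # Symmetric covariance matrix
e <- eigen(S)                                                  # Q = e$vectors, D = diag(e$values)
max(abs(S - e$vectors %*% diag(e$values) %*% t(e$vectors)))    # Reconstruction error: numerically zero
max(abs(t(e$vectors) %*% e$vectors - diag(4)))                 # Q'Q = I_4 (orthogonality)</code></pre></div>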
</div>
<div id="pca" class="section level3">
<h3><span class="header-section-number">15.2.2</span> PCA</h3>
<p>
The goal of PCA is to build a dataset <span class="math inline">\(\tilde{\textbf{X}}\)</span> that has fewer columns but that keeps as much information as possible when compressing the original one, <span class="math inline">\(\textbf{X}\)</span>. The key notion is the <strong>change of base</strong>, which is a linear transformation of <span class="math inline">\(\textbf{X}\)</span> into <span class="math inline">\(\textbf{Z}\)</span>, a matrix with identical dimension, via
<span class="math display" id="eq:pca">\[\begin{equation}
\tag{15.3}
\textbf{Z}=\textbf{XP},
\end{equation}\]</span>
where <span class="math inline">\(\textbf{P}\)</span> is a <span class="math inline">\(K \times K\)</span> matrix. There are of course an infinite number of ways to transform <span class="math inline">\(\textbf{X}\)</span> into <span class="math inline">\(\textbf{Z}\)</span>, but two fundamental constraints help reduce the possibilities. The first constraint is that the columns of <span class="math inline">\(\textbf{Z}\)</span> be uncorrelated. Having uncorrelated features is desirable because they then all tell different stories and have zero redundancy. The second constraint is that the variance of the columns of <span class="math inline">\(\textbf{Z}\)</span> is highly concentrated. This means that a few factors (columns) will capture most of the explanatory power (signal), while the remaining ones will consist predominantly of noise. All of this is coded in the covariance matrix of <span class="math inline">\(\textbf{Z}\)</span>:</p>
<ul>
<li>the first condition imposes that the covariance matrix be diagonal;<br />
</li>
<li>the second condition imposes that the diagonal elements, when ranked in decreasing magnitude, see their value decline (sharply if possible).</li>
</ul>
<p>The covariance matrix of <span class="math inline">\(\textbf{Z}\)</span> is
<span class="math display" id="eq:covy">\[\begin{equation}
\tag{15.4}
\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{Z}'\textbf{Z}=\frac{1}{I-1}\textbf{P}'\textbf{X}'\textbf{XP}=\frac{1}{I-1}\textbf{P}'\boldsymbol{\Sigma}_X\textbf{P}.
\end{equation}\]</span></p>
<p>In this expression, we plug the decomposition <a href="unsup.html#eq:diagonaliz">(15.2)</a> of <span class="math inline">\(\boldsymbol{\Sigma}_X\)</span>:
<span class="math display">\[\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{P}'\textbf{Q}\textbf{DQ}'\textbf{P},\]</span>
thus, picking <span class="math inline">\(\textbf{P}=\textbf{Q}\)</span>, we get, by orthogonality, <span class="math inline">\(\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{D}\)</span>, that is, a diagonal covariance matrix for <span class="math inline">\(\textbf{Z}\)</span>. The columns of <span class="math inline">\(\textbf{Z}\)</span> can then be re-shuffled in decreasing order of variance so that the diagonal elements of <span class="math inline">\(\boldsymbol{\Sigma}_Z\)</span> progressively shrink. This is useful because it helps locate the factors with the most informational content (the first factors). In the limit, a constant vector (with zero variance) carries no signal.</p>
<p>The matrix <span class="math inline">\(\textbf{Z}\)</span> is a linear transformation of <span class="math inline">\(\textbf{X}\)</span>; thus, it is expected to carry the same information, even though this information is coded differently. Since the columns are ordered according to their relative importance, it is simple to omit some of them. The new set of features <span class="math inline">\(\tilde{\textbf{X}}\)</span> consists of the first <span class="math inline">\(K'\)</span> (with <span class="math inline">\(K'<K\)</span>) columns of <span class="math inline">\(\textbf{Z}\)</span>.</p>
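<p>Before turning to the dedicated prcomp() function, the derivation above can be verified by hand: rotating the (centered) features with the eigenvectors of their covariance matrix yields columns whose covariance matrix is diagonal. The sketch below assumes the training_sample and features_short objects used in the next chunk.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">X <- training_sample %>% dplyr::select(features_short) %>% as.matrix()
X <- scale(X, center = TRUE, scale = FALSE)   # Remove column means (zero-mean assumption)
Q <- eigen(cov(X))$vectors                    # P = Q: eigenvectors of the covariance matrix
Z <- X %*% Q                                  # Change of base: Z = XP
round(cov(Z), 5)                              # Off-diagonal terms are (numerically) zero</code></pre></div>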
<p>Below, we show how to perform PCA and visualize the output with the <em>factoextra</em> package. To ease readability, we use the same sample but with the smaller set of predictors.</p>
<div class="sourceCode" id="cb226"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb226-1"><a href="unsup.html#cb226-1"></a>pca <-<span class="st"> </span>training_sample <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb226-2"><a href="unsup.html#cb226-2"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="co"># Smaller number of predictors</span></span>
<span id="cb226-3"><a href="unsup.html#cb226-3"></a><span class="st"> </span><span class="kw">prcomp</span>() <span class="co"># Performs PCA</span></span>
<span id="cb226-4"><a href="unsup.html#cb226-4"></a>pca <span class="co"># Show the result</span></span></code></pre></div>
<pre><code>## Standard deviations (1, .., p=7):
## [1] 0.4536601 0.3344080 0.2994393 0.2452000 0.2352087 0.2010782 0.1140988
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5 PC6
## Div_Yld 0.27159946 -0.57909866 0.04572501 -0.52895604 -0.22662581 -0.506566090
## Eps 0.42040708 -0.15008243 -0.02476659 0.33737265 0.77137719 -0.301883295
## Mkt_Cap_12M_Usd 0.52386846 0.34323935 0.17228893 0.06249528 -0.25278113 -0.002987057
## Mom_11M_Usd 0.04723846 0.05771359 -0.89715955 0.24101481 -0.25055884 -0.258476580
## Ocf 0.53294744 0.19588990 0.18503939 0.23437100 -0.35759553 -0.049015486
## Pb 0.15241340 0.58080620 -0.22104807 -0.68213576 0.30866476 -0.038674594
## Vol1Y_Usd -0.40688963 0.38113933 0.28216181 0.15541056 -0.06157461 -0.762587677
## PC7
## Div_Yld 0.032011635
## Eps 0.011965041
## Mkt_Cap_12M_Usd 0.714319417
## Mom_11M_Usd 0.043178747
## Ocf -0.676866120
## Pb -0.168799297
## Vol1Y_Usd 0.008632062</code></pre>
<p>The rotation gives the matrix <span class="math inline">\(\textbf{P}\)</span>: it is the matrix that performs the change of base. The first line of the output indicates the standard deviation of each new factor (column). Each factor is labeled with a PC index (principal component). Often, the first PC (column PC1 in the output) loads positively on all initial features: a convex weighted average of all predictors is expected to carry a lot of information. In the above example, this is almost the case, with the exception of volatility, which has a negative coefficient in the first PC. The second PC is an arbitrage between price-to-book (long) and dividend yield (short). The third PC is contrarian, as it loads heavily and negatively on momentum. Not all principal components are easy to interpret.</p>
<p>Sometimes, it can be useful to visualize the way the principal components are built. In Figure <a href="unsup.html#fig:pca2">15.2</a>, we show one popular representation that is used for two factors (usually the first two). </p>
<div class="sourceCode" id="cb228"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb228-1"><a href="unsup.html#cb228-1"></a><span class="kw">library</span>(factoextra) <span class="co"># Package for PCA visualization</span></span>
<span id="cb228-2"><a href="unsup.html#cb228-2"></a><span class="kw">fviz_pca_var</span>(pca, <span class="co"># Source of PCA decomposition</span></span>
<span id="cb228-3"><a href="unsup.html#cb228-3"></a> <span class="dt">col.var=</span><span class="st">"contrib"</span>, </span>
<span id="cb228-4"><a href="unsup.html#cb228-4"></a> <span class="dt">gradient.cols =</span> <span class="kw">c</span>(<span class="st">"#00AFBB"</span>, <span class="st">"#E7B800"</span>, <span class="st">"#FC4E07"</span>),</span>
<span id="cb228-5"><a href="unsup.html#cb228-5"></a> <span class="dt">repel =</span> <span class="ot">TRUE</span> <span class="co"># Avoid text overlapping</span></span>
<span id="cb228-6"><a href="unsup.html#cb228-6"></a>)</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:pca2"></span>
<img src="ML_factor_files/figure-html/pca2-1.png" alt="Visual representation of PCA with two dimensions." width="330px" height="200px" />
<p class="caption">
FIGURE 15.2: Visual representation of PCA with two dimensions.
</p>
</div>
<p>The plot shows that no initial factor has negative signs on both of the first two principal components: volatility is negative for the first one, while earnings per share and dividend yield are negative for the second. The numbers indicated along the axes are the proportions of variance explained by each PC. They are obtained from the standard deviations in the first line of the output: each value is squared and then divided by the sum of all squared values.</p>
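<p>These proportions can be recovered directly from the prcomp() output: the standard deviations are squared and normalized so that they sum to one.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pca$sdev^2 / sum(pca$sdev^2)   # Share of total variance captured by each principal component</code></pre></div>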
<p>Once the rotation is known, it is possible to keep only a subset of the transformed data. From the original 7 features, we retain only the first 4 components below.</p>
<div class="sourceCode" id="cb229"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb229-1"><a href="unsup.html#cb229-1"></a>training_sample <span class="op">%>%</span><span class="st"> </span><span class="co"># Start from large sample</span></span>
<span id="cb229-2"><a href="unsup.html#cb229-2"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="co"># Keep only 7 features</span></span>
<span id="cb229-3"><a href="unsup.html#cb229-3"></a><span class="st"> </span><span class="kw">as.matrix</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Transform in matrix</span></span>
<span id="cb229-4"><a href="unsup.html#cb229-4"></a><span class="st"> </span><span class="kw">multiply_by_matrix</span>(pca<span class="op">$</span>rotation[,<span class="dv">1</span><span class="op">:</span><span class="dv">4</span>]) <span class="op">%>%</span><span class="st"> </span><span class="co"># Rotate via PCA (first 4 columns of P)</span></span>
<span id="cb229-5"><a href="unsup.html#cb229-5"></a><span class="st"> `</span><span class="dt">colnames<-</span><span class="st">`</span>(<span class="kw">c</span>(<span class="st">"PC1"</span>, <span class="st">"PC2"</span>, <span class="st">"PC3"</span>, <span class="st">"PC4"</span>)) <span class="op">%>%</span><span class="st"> </span><span class="co"># Change column names</span></span>
<span id="cb229-6"><a href="unsup.html#cb229-6"></a><span class="st"> </span><span class="kw">head</span>() <span class="co"># Show first 6 lines</span></span></code></pre></div>
<pre><code>## PC1 PC2 PC3 PC4
## [1,] 0.3989674 0.7578132 -0.13915223 0.3132578
## [2,] 0.4284697 0.7587274 -0.40164338 0.3745255
## [3,] 0.5215295 0.5679119 -0.10533870 0.2574949
## [4,] 0.5445359 0.5335619 -0.08833864 0.2281793
## [5,] 0.5672644 0.5339749 -0.06092424 0.2320938
## [6,] 0.5871306 0.6420126 -0.44566482 0.3075399</code></pre>
<p>These 4 factors can then be used as orthogonal features in any ML engine. The fact that the features are uncorrelated is undoubtedly an asset. But the price of this convenience is high: the features are no longer immediately interpretable. De-correlating the predictors adds yet another layer of “<em>blackbox-ing</em>” in the algorithm. </p>
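<p>As a purely illustrative follow-up, these components can be plugged directly into any predictive engine, for instance a plain regression of future returns on the first four factors. The sketch below assumes the objects defined above (and that the <em>magrittr</em> aliases are loaded).</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">Z <- training_sample %>%                           # Rotated features, as in the previous chunk
    dplyr::select(features_short) %>%
    as.matrix() %>%
    multiply_by_matrix(pca$rotation[, 1:4])        # First 4 columns of P
summary(lm(training_sample$R1M_Usd ~ Z))           # Simple regression on the 4 components</code></pre></div>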
<p>PCA can also be used to estimate factor models. In Equation <a href="unsup.html#eq:pca">(15.3)</a>, it suffices to replace <span class="math inline">\(\textbf{Z}\)</span> with returns, <span class="math inline">\(\textbf{X}\)</span> with factor values and <span class="math inline">\(\textbf{P}\)</span> with factor loadings (see, e.g., <span class="citation">Connor and Korajczyk (<a href="#ref-connor1988risk" role="doc-biblioref">1988</a>)</span> for an early reference). More recently, <span class="citation">Lettau and Pelger (<a href="#ref-lettau2018estimating" role="doc-biblioref">2020</a><a href="#ref-lettau2018estimating" role="doc-biblioref">a</a>)</span> and <span class="citation">Lettau and Pelger (<a href="#ref-lettau2018factors" role="doc-biblioref">2020</a><a href="#ref-lettau2018factors" role="doc-biblioref">b</a>)</span> propose a thorough analysis of PCA estimation techniques. They notably argue that first moments of returns are important and should be included in the objective function, alongside the optimization on the second moments.</p>
<p>We end this subsection with a technical note. Usually, PCA is performed on the covariance matrix of returns. Sometimes, it may be preferable to decompose the <strong>correlation</strong> matrix instead. The result may change substantially if the variables have very different variances (which is not really the case in the equity space). If the investment universe encompasses several asset classes, a correlation-based PCA will reduce the importance of the most volatile class. In this case, it is as if all returns were scaled by their respective volatilities.</p>
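<p>In practice, the correlation-based version simply amounts to scaling the columns to unit variance before the decomposition, which prcomp() handles through its scale. argument. The lines below are a minimal sketch with the same small set of predictors.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pca_corr <- training_sample %>% 
    dplyr::select(features_short) %>% 
    prcomp(scale. = TRUE)        # PCA on the correlation matrix (unit-variance columns)
pca_corr$sdev                    # Standard deviations of the rescaled components</code></pre></div>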
</div>
<div id="ae" class="section level3">
<h3><span class="header-section-number">15.2.3</span> Autoencoders</h3>
<p></p>
<p>In a PCA, the coding from <span class="math inline">\(\textbf{X}\)</span> to <span class="math inline">\(\textbf{Z}\)</span> is straightforward, linear and works both ways:
<span class="math display">\[\textbf{Z}=\textbf{X}\textbf{P} \quad \text{and} \quad \textbf{X}=\textbf{Z}\textbf{P}',\]</span>
so that we recover <span class="math inline">\(\textbf{X}\)</span> from <span class="math inline">\(\textbf{Z}\)</span>. This can be written differently:
<span class="math display" id="eq:pcascheme">\[\begin{equation}
\tag{15.5}
\textbf{X} \quad \overset{\text{encode via }\textbf{P}}{\longrightarrow} \quad \textbf{Z} \quad \overset{\text{decode via } \textbf{P}'}{\longrightarrow} \quad \textbf{X}
\end{equation}\]</span></p>
<p>If we take the truncated version and seek a smaller output (with only <span class="math inline">\(K'\)</span> columns), this gives:</p>
<p><span class="math display" id="eq:pcaschem2">\[\begin{equation}
\tag{15.6}
\textbf{X}, \ (I\times K) \quad \overset{\text{encode via }\textbf{P}_{K'}}{\longrightarrow} \quad \tilde{\textbf{X}}, \ (I \times K') \quad \overset{\text{decode via } \textbf{P}'_{K'}}{\longrightarrow} \quad \breve{\textbf{X}},\ (I \times K),
\end{equation}\]</span></p>
<p>where <span class="math inline">\(\textbf{P}_{K'}\)</span> is the restriction of <span class="math inline">\(\textbf{P}\)</span> to the <span class="math inline">\(K'\)</span> columns that correspond to the factors with the largest variances. The dimensions of the matrices are indicated in parentheses. In this case, the decoding cannot recover <span class="math inline">\(\textbf{X}\)</span> exactly but only an approximation, which we write <span class="math inline">\(\breve{\textbf{X}}\)</span>. This approximation is coded with less information, hence this new dataset <span class="math inline">\(\breve{\textbf{X}}\)</span> is compressed and provides a parsimonious representation of the original sample <span class="math inline">\(\textbf{X}\)</span>.</p>
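<p>The quality of this compression can be gauged by reconstructing the sample from the first <span class="math inline">\(K'\)</span> components and measuring the error. A minimal sketch with <span class="math inline">\(K'=4\)</span>, based on the pca object obtained previously, is shown below.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">X  <- training_sample %>% dplyr::select(features_short) %>% as.matrix()
Xc <- scale(X, center = TRUE, scale = FALSE)   # Centered features (prcomp centers by default)
P4 <- pca$rotation[, 1:4]                      # Restriction of P to its first 4 columns
X_breve <- Xc %*% P4 %*% t(P4)                 # Encode, then decode
mean((Xc - X_breve)^2)                         # Average reconstruction error</code></pre></div>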
<p>An autoencoder generalizes this concept to <strong>nonlinear</strong> coding functions. Simple linear autoencoders are closely linked to latent factor models (this equivalence has been formalized for single-layer autoencoders). The scheme is the following:
<span class="math display" id="eq:aescheme2">\[\begin{equation}
\tag{15.7}
\textbf{X},\ (I\times K) \quad \overset{\text{encode via } N} {\longrightarrow} \quad \tilde{\textbf{X}}=N(\textbf{X}), \ (I \times K') \quad \overset{\text{decode via } N'}{\longrightarrow} \quad \breve{\textbf{X}}=N'(\tilde{\textbf{X}}), \ (I \times K),
\end{equation}\]</span></p>
<p>where the encoding and decoding functions <span class="math inline">\(N\)</span> and <span class="math inline">\(N'\)</span> are often taken to be neural networks. The term <strong>autoencoder</strong> comes from the fact that the target output, which we often write <span class="math inline">\(\textbf{Z}\)</span>, is the original sample <span class="math inline">\(\textbf{X}\)</span>. Thus, the algorithm seeks to determine the function <span class="math inline">\(N\)</span> that minimizes the distance (to be defined) between <span class="math inline">\(\textbf{X}\)</span> and the output value <span class="math inline">\(\breve{\textbf{X}}\)</span>. The encoder generates an alternative representation of <span class="math inline">\(\textbf{X}\)</span>, whereas the decoder tries to recode it back to its original values. Naturally, the intermediate (coded) version <span class="math inline">\(\tilde{\textbf{X}}\)</span> is designed to have a smaller dimension than <span class="math inline">\(\textbf{X}\)</span>.</p>
</div>
<div id="application" class="section level3">
<h3><span class="header-section-number">15.2.4</span> Application</h3>
<p>
Autoencoders are easy to code in Keras (see Chapter <a href="NN.html#NN">7</a> for more details on Keras). To underline the power of the framework, we resort to another way of coding a NN: the so-called functional API. For simplicity, we work with the small number of predictors (7). The structure of the network consists of two symmetric parts (encoder and decoder), each with one intermediate layer containing 32 units. The activation function is sigmoid; this makes sense since the input has values in the unit interval.</p>
<div class="sourceCode" id="cb231"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb231-1"><a href="unsup.html#cb231-1"></a>input_layer <-<span class="st"> </span><span class="kw">layer_input</span>(<span class="dt">shape =</span> <span class="kw">c</span>(<span class="dv">7</span>)) <span class="co"># features_short has 7 columns </span></span>
<span id="cb231-2"><a href="unsup.html#cb231-2"></a></span>
<span id="cb231-3"><a href="unsup.html#cb231-3"></a>encoder <-<span class="st"> </span>input_layer <span class="op">%>%</span><span class="st"> </span><span class="co"># First, encode</span></span>
<span id="cb231-4"><a href="unsup.html#cb231-4"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">32</span>, <span class="dt">activation =</span> <span class="st">"sigmoid"</span>) <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb231-5"><a href="unsup.html#cb231-5"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">4</span>) <span class="co"># 4 dimensions for the output layer (same as PCA example)</span></span>
<span id="cb231-6"><a href="unsup.html#cb231-6"></a></span>
<span id="cb231-7"><a href="unsup.html#cb231-7"></a>decoder <-<span class="st"> </span>encoder <span class="op">%>%</span><span class="st"> </span><span class="co"># Then, from encoder, decode</span></span>
<span id="cb231-8"><a href="unsup.html#cb231-8"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">32</span>, <span class="dt">activation =</span> <span class="st">"sigmoid"</span>) <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb231-9"><a href="unsup.html#cb231-9"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">7</span>) <span class="co"># the original sample has 7 features</span></span></code></pre></div>
<p>In the training part, we optimize the MSE and use an Adam update of the weights (see Section <a href="NN.html#backprop">7.2.3</a>).</p>
<div class="sourceCode" id="cb232"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb232-1"><a href="unsup.html#cb232-1"></a>ae_model <-<span class="st"> </span><span class="kw">keras_model</span>(<span class="dt">inputs =</span> input_layer, <span class="dt">outputs =</span> decoder) <span class="co"># Builds the model</span></span>
<span id="cb232-2"><a href="unsup.html#cb232-2"></a></span>
<span id="cb232-3"><a href="unsup.html#cb232-3"></a>ae_model <span class="op">%>%</span><span class="st"> </span><span class="kw">compile</span>( <span class="co"># Learning parameters</span></span>
<span id="cb232-4"><a href="unsup.html#cb232-4"></a> <span class="dt">loss =</span> <span class="st">'mean_squared_error'</span>,</span>
<span id="cb232-5"><a href="unsup.html#cb232-5"></a> <span class="dt">optimizer =</span> <span class="st">'adam'</span>,</span>
<span id="cb232-6"><a href="unsup.html#cb232-6"></a> <span class="dt">metrics =</span> <span class="kw">c</span>(<span class="st">'mean_absolute_error'</span>)</span>
<span id="cb232-7"><a href="unsup.html#cb232-7"></a>)</span></code></pre></div>
<p>Finally, we are ready to train the data onto itself! The evolution of the loss on the training and testing samples is depicted in Figure <a href="unsup.html#fig:aekeras3">15.3</a>. The decreasing pattern shows the improvement in the quality of the compression.</p>
<div class="sourceCode" id="cb233"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb233-1"><a href="unsup.html#cb233-1"></a>fit_ae <-<span class="st"> </span>ae_model <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb233-2"><a href="unsup.html#cb233-2"></a><span class="st"> </span><span class="kw">fit</span>(training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(), <span class="co"># Input</span></span>
<span id="cb233-3"><a href="unsup.html#cb233-3"></a> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(), <span class="co"># Output</span></span>
<span id="cb233-4"><a href="unsup.html#cb233-4"></a> <span class="dt">epochs =</span> <span class="dv">15</span>, <span class="dt">batch_size =</span> <span class="dv">512</span>,</span>
<span id="cb233-5"><a href="unsup.html#cb233-5"></a> <span class="dt">validation_data =</span> <span class="kw">list</span>(testing_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(), </span>
<span id="cb233-6"><a href="unsup.html#cb233-6"></a> testing_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>())</span>
<span id="cb233-7"><a href="unsup.html#cb233-7"></a> )</span>
<span id="cb233-8"><a href="unsup.html#cb233-8"></a><span class="kw">plot</span>(fit_ae) <span class="op">+</span><span class="st"> </span><span class="kw">theme_grey</span>()</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:aekeras3"></span>
<img src="ML_factor_files/figure-html/aekeras3-1.png" alt="Output from the training of an autoencoder." width="400px" />
<p class="caption">
FIGURE 15.3: Output from the training of an autoencoder.
</p>
</div>
<p>In order to get the details of all weights and biases, the syntax is the following.</p>
<div class="sourceCode" id="cb234"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb234-1"><a href="unsup.html#cb234-1"></a>ae_weights <-<span class="st"> </span>ae_model <span class="op">%>%</span><span class="st"> </span><span class="kw">get_weights</span>()</span></code></pre></div>
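<p>To retrieve the compressed representation itself (the analogue of the reduced set of PCA factors), one possible approach is to wrap the encoder part in its own model and call predict() on it. This is a sketch, not part of the original chunks; it assumes the input_layer and encoder objects defined above.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">encoder_model <- keras_model(inputs = input_layer, outputs = encoder)   # Encoder only
latent_factors <- encoder_model %>% 
    predict(training_sample %>% dplyr::select(features_short) %>% as.matrix())
head(latent_factors)                                                     # 4 compressed features per instance</code></pre></div>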