<!DOCTYPE html>
<html lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Chapter 15 Unsupervised learning | Machine Learning for Factor Investing</title>
<meta name="description" content="Chapter 15 Unsupervised learning | Machine Learning for Factor Investing" />
<meta name="generator" content="bookdown 0.21 and GitBook 2.6.7" />
<meta property="og:title" content="Chapter 15 Unsupervised learning | Machine Learning for Factor Investing" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 15 Unsupervised learning | Machine Learning for Factor Investing" />
<meta name="author" content="Guillaume Coqueret and Tony Guida" />
<meta name="date" content="2021-01-08" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<link rel="prev" href="causality.html"/>
<link rel="next" href="RL.html"/>
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />
<script src="libs/accessible-code-block-0.0.1/empty-anchor.js"></script>
<link href="libs/anchor-sections-1.0/anchor-sections.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.0/anchor-sections.js"></script>
<script src="libs/kePrint-0.0.1/kePrint.js"></script>
<link href="libs/lightable-0.0.1/lightable.css" rel="stylesheet" />
<style type="text/css">
code.sourceCode > span { display: inline-block; line-height: 1.25; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html"><i class="fa fa-check"></i>Preface</a><ul>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#what-this-book-is-not-about"><i class="fa fa-check"></i>What this book is not about</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#the-targeted-audience"><i class="fa fa-check"></i>The targeted audience</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#how-this-book-is-structured"><i class="fa fa-check"></i>How this book is structured</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#companion-website"><i class="fa fa-check"></i>Companion website</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#why-r"><i class="fa fa-check"></i>Why R?</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#coding-instructions"><i class="fa fa-check"></i>Coding instructions</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#acknowledgments"><i class="fa fa-check"></i>Acknowledgments</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#future-developments"><i class="fa fa-check"></i>Future developments</a></li>
</ul></li>
<li class="part"><span><b>I Introduction</b></span></li>
<li class="chapter" data-level="1" data-path="notdata.html"><a href="notdata.html"><i class="fa fa-check"></i><b>1</b> Notations and data</a><ul>
<li class="chapter" data-level="1.1" data-path="notdata.html"><a href="notdata.html#notations"><i class="fa fa-check"></i><b>1.1</b> Notations</a></li>
<li class="chapter" data-level="1.2" data-path="notdata.html"><a href="notdata.html#dataset"><i class="fa fa-check"></i><b>1.2</b> Dataset</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="intro.html"><a href="intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
<li class="chapter" data-level="2.1" data-path="intro.html"><a href="intro.html#context"><i class="fa fa-check"></i><b>2.1</b> Context</a></li>
<li class="chapter" data-level="2.2" data-path="intro.html"><a href="intro.html#portfolio-construction-the-workflow"><i class="fa fa-check"></i><b>2.2</b> Portfolio construction: the workflow</a></li>
<li class="chapter" data-level="2.3" data-path="intro.html"><a href="intro.html#machine-learning-is-no-magic-wand"><i class="fa fa-check"></i><b>2.3</b> Machine learning is no magic wand</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="factor.html"><a href="factor.html"><i class="fa fa-check"></i><b>3</b> Factor investing and asset pricing anomalies</a><ul>
<li class="chapter" data-level="3.1" data-path="factor.html"><a href="factor.html#introduction"><i class="fa fa-check"></i><b>3.1</b> Introduction</a></li>
<li class="chapter" data-level="3.2" data-path="factor.html"><a href="factor.html#detecting-anomalies"><i class="fa fa-check"></i><b>3.2</b> Detecting anomalies</a><ul>
<li class="chapter" data-level="3.2.1" data-path="factor.html"><a href="factor.html#challenges"><i class="fa fa-check"></i><b>3.2.1</b> Challenges</a></li>
<li class="chapter" data-level="3.2.2" data-path="factor.html"><a href="factor.html#simple-portfolio-sorts"><i class="fa fa-check"></i><b>3.2.2</b> Simple portfolio sorts </a></li>
<li class="chapter" data-level="3.2.3" data-path="factor.html"><a href="factor.html#factors"><i class="fa fa-check"></i><b>3.2.3</b> Factors</a></li>
<li class="chapter" data-level="3.2.4" data-path="factor.html"><a href="factor.html#predictive-regressions-sorts-and-p-value-issues"><i class="fa fa-check"></i><b>3.2.4</b> Predictive regressions, sorts, and p-value issues</a></li>
<li class="chapter" data-level="3.2.5" data-path="factor.html"><a href="factor.html#fama-macbeth-regressions"><i class="fa fa-check"></i><b>3.2.5</b> Fama-Macbeth regressions</a></li>
<li class="chapter" data-level="3.2.6" data-path="factor.html"><a href="factor.html#factor-competition"><i class="fa fa-check"></i><b>3.2.6</b> Factor competition</a></li>
<li class="chapter" data-level="3.2.7" data-path="factor.html"><a href="factor.html#advanced-techniques"><i class="fa fa-check"></i><b>3.2.7</b> Advanced techniques</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="factor.html"><a href="factor.html#factors-or-characteristics"><i class="fa fa-check"></i><b>3.3</b> Factors or characteristics?</a></li>
<li class="chapter" data-level="3.4" data-path="factor.html"><a href="factor.html#hot-topics-momentum-timing-and-esg"><i class="fa fa-check"></i><b>3.4</b> Hot topics: momentum, timing and ESG</a><ul>
<li class="chapter" data-level="3.4.1" data-path="factor.html"><a href="factor.html#factor-momentum"><i class="fa fa-check"></i><b>3.4.1</b> Factor momentum</a></li>
<li class="chapter" data-level="3.4.2" data-path="factor.html"><a href="factor.html#factor-timing"><i class="fa fa-check"></i><b>3.4.2</b> Factor timing</a></li>
<li class="chapter" data-level="3.4.3" data-path="factor.html"><a href="factor.html#the-green-factors"><i class="fa fa-check"></i><b>3.4.3</b> The green factors</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="factor.html"><a href="factor.html#the-links-with-machine-learning"><i class="fa fa-check"></i><b>3.5</b> The links with machine learning</a><ul>
<li class="chapter" data-level="3.5.1" data-path="factor.html"><a href="factor.html#a-short-list-of-recent-references"><i class="fa fa-check"></i><b>3.5.1</b> A short list of recent references</a></li>
<li class="chapter" data-level="3.5.2" data-path="factor.html"><a href="factor.html#explicit-connections-with-asset-pricing-models"><i class="fa fa-check"></i><b>3.5.2</b> Explicit connections with asset pricing models</a></li>
</ul></li>
<li class="chapter" data-level="3.6" data-path="factor.html"><a href="factor.html#coding-exercises"><i class="fa fa-check"></i><b>3.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="Data.html"><a href="Data.html"><i class="fa fa-check"></i><b>4</b> Data preprocessing</a><ul>
<li class="chapter" data-level="4.1" data-path="Data.html"><a href="Data.html#know-your-data"><i class="fa fa-check"></i><b>4.1</b> Know your data</a></li>
<li class="chapter" data-level="4.2" data-path="Data.html"><a href="Data.html#missing-data"><i class="fa fa-check"></i><b>4.2</b> Missing data</a></li>
<li class="chapter" data-level="4.3" data-path="Data.html"><a href="Data.html#outlier-detection"><i class="fa fa-check"></i><b>4.3</b> Outlier detection</a></li>
<li class="chapter" data-level="4.4" data-path="Data.html"><a href="Data.html#feateng"><i class="fa fa-check"></i><b>4.4</b> Feature engineering</a><ul>
<li class="chapter" data-level="4.4.1" data-path="Data.html"><a href="Data.html#feature-selection"><i class="fa fa-check"></i><b>4.4.1</b> Feature selection</a></li>
<li class="chapter" data-level="4.4.2" data-path="Data.html"><a href="Data.html#scaling"><i class="fa fa-check"></i><b>4.4.2</b> Scaling the predictors</a></li>
</ul></li>
<li class="chapter" data-level="4.5" data-path="Data.html"><a href="Data.html#labelling"><i class="fa fa-check"></i><b>4.5</b> Labelling</a><ul>
<li class="chapter" data-level="4.5.1" data-path="Data.html"><a href="Data.html#simple-labels"><i class="fa fa-check"></i><b>4.5.1</b> Simple labels</a></li>
<li class="chapter" data-level="4.5.2" data-path="Data.html"><a href="Data.html#categorical-labels"><i class="fa fa-check"></i><b>4.5.2</b> Categorical labels</a></li>
<li class="chapter" data-level="4.5.3" data-path="Data.html"><a href="Data.html#the-triple-barrier-method"><i class="fa fa-check"></i><b>4.5.3</b> The triple barrier method</a></li>
<li class="chapter" data-level="4.5.4" data-path="Data.html"><a href="Data.html#filtering-the-sample"><i class="fa fa-check"></i><b>4.5.4</b> Filtering the sample</a></li>
<li class="chapter" data-level="4.5.5" data-path="Data.html"><a href="Data.html#horizons"><i class="fa fa-check"></i><b>4.5.5</b> Return horizons</a></li>
</ul></li>
<li class="chapter" data-level="4.6" data-path="Data.html"><a href="Data.html#pers"><i class="fa fa-check"></i><b>4.6</b> Handling persistence</a></li>
<li class="chapter" data-level="4.7" data-path="Data.html"><a href="Data.html#extensions"><i class="fa fa-check"></i><b>4.7</b> Extensions</a><ul>
<li class="chapter" data-level="4.7.1" data-path="Data.html"><a href="Data.html#transforming-features"><i class="fa fa-check"></i><b>4.7.1</b> Transforming features</a></li>
<li class="chapter" data-level="4.7.2" data-path="Data.html"><a href="Data.html#macrovar"><i class="fa fa-check"></i><b>4.7.2</b> Macro-economic variables</a></li>
<li class="chapter" data-level="4.7.3" data-path="Data.html"><a href="Data.html#active-learning"><i class="fa fa-check"></i><b>4.7.3</b> Active learning</a></li>
</ul></li>
<li class="chapter" data-level="4.8" data-path="Data.html"><a href="Data.html#additional-code-and-results"><i class="fa fa-check"></i><b>4.8</b> Additional code and results</a><ul>
<li class="chapter" data-level="4.8.1" data-path="Data.html"><a href="Data.html#impact-of-rescaling-graphical-representation"><i class="fa fa-check"></i><b>4.8.1</b> Impact of rescaling: graphical representation</a></li>
<li class="chapter" data-level="4.8.2" data-path="Data.html"><a href="Data.html#impact-of-rescaling-toy-example"><i class="fa fa-check"></i><b>4.8.2</b> Impact of rescaling: toy example</a></li>
</ul></li>
<li class="chapter" data-level="4.9" data-path="Data.html"><a href="Data.html#coding-exercises-1"><i class="fa fa-check"></i><b>4.9</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>II Common supervised algorithms</b></span></li>
<li class="chapter" data-level="5" data-path="lasso.html"><a href="lasso.html"><i class="fa fa-check"></i><b>5</b> Penalized regressions and sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.1" data-path="lasso.html"><a href="lasso.html#penalized-regressions"><i class="fa fa-check"></i><b>5.1</b> Penalized regressions</a><ul>
<li class="chapter" data-level="5.1.1" data-path="lasso.html"><a href="lasso.html#penreg"><i class="fa fa-check"></i><b>5.1.1</b> Simple regressions</a></li>
<li class="chapter" data-level="5.1.2" data-path="lasso.html"><a href="lasso.html#forms-of-penalizations"><i class="fa fa-check"></i><b>5.1.2</b> Forms of penalizations</a></li>
<li class="chapter" data-level="5.1.3" data-path="lasso.html"><a href="lasso.html#illustrations"><i class="fa fa-check"></i><b>5.1.3</b> Illustrations</a></li>
</ul></li>
<li class="chapter" data-level="5.2" data-path="lasso.html"><a href="lasso.html#sparse-hedging-for-minimum-variance-portfolios"><i class="fa fa-check"></i><b>5.2</b> Sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.2.1" data-path="lasso.html"><a href="lasso.html#presentation-and-derivations"><i class="fa fa-check"></i><b>5.2.1</b> Presentation and derivations</a></li>
<li class="chapter" data-level="5.2.2" data-path="lasso.html"><a href="lasso.html#sparseex"><i class="fa fa-check"></i><b>5.2.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="5.3" data-path="lasso.html"><a href="lasso.html#predictive-regressions"><i class="fa fa-check"></i><b>5.3</b> Predictive regressions</a><ul>
<li class="chapter" data-level="5.3.1" data-path="lasso.html"><a href="lasso.html#literature-review-and-principle"><i class="fa fa-check"></i><b>5.3.1</b> Literature review and principle</a></li>
<li class="chapter" data-level="5.3.2" data-path="lasso.html"><a href="lasso.html#code-and-results"><i class="fa fa-check"></i><b>5.3.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="5.4" data-path="lasso.html"><a href="lasso.html#coding-exercise"><i class="fa fa-check"></i><b>5.4</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="trees.html"><a href="trees.html"><i class="fa fa-check"></i><b>6</b> Tree-based methods</a><ul>
<li class="chapter" data-level="6.1" data-path="trees.html"><a href="trees.html#simple-trees"><i class="fa fa-check"></i><b>6.1</b> Simple trees</a><ul>
<li class="chapter" data-level="6.1.1" data-path="trees.html"><a href="trees.html#principle"><i class="fa fa-check"></i><b>6.1.1</b> Principle</a></li>
<li class="chapter" data-level="6.1.2" data-path="trees.html"><a href="trees.html#treeclass"><i class="fa fa-check"></i><b>6.1.2</b> Further details on classification</a></li>
<li class="chapter" data-level="6.1.3" data-path="trees.html"><a href="trees.html#pruning-criteria"><i class="fa fa-check"></i><b>6.1.3</b> Pruning criteria</a></li>
<li class="chapter" data-level="6.1.4" data-path="trees.html"><a href="trees.html#code-and-interpretation"><i class="fa fa-check"></i><b>6.1.4</b> Code and interpretation</a></li>
</ul></li>
<li class="chapter" data-level="6.2" data-path="trees.html"><a href="trees.html#random-forests"><i class="fa fa-check"></i><b>6.2</b> Random forests</a><ul>
<li class="chapter" data-level="6.2.1" data-path="trees.html"><a href="trees.html#principle-1"><i class="fa fa-check"></i><b>6.2.1</b> Principle</a></li>
<li class="chapter" data-level="6.2.2" data-path="trees.html"><a href="trees.html#code-and-results-1"><i class="fa fa-check"></i><b>6.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="6.3" data-path="trees.html"><a href="trees.html#adaboost"><i class="fa fa-check"></i><b>6.3</b> Boosted trees: Adaboost</a><ul>
<li class="chapter" data-level="6.3.1" data-path="trees.html"><a href="trees.html#methodology"><i class="fa fa-check"></i><b>6.3.1</b> Methodology</a></li>
<li class="chapter" data-level="6.3.2" data-path="trees.html"><a href="trees.html#illustration"><i class="fa fa-check"></i><b>6.3.2</b> Illustration</a></li>
</ul></li>
<li class="chapter" data-level="6.4" data-path="trees.html"><a href="trees.html#boosted-trees-extreme-gradient-boosting"><i class="fa fa-check"></i><b>6.4</b> Boosted trees: extreme gradient boosting</a><ul>
<li class="chapter" data-level="6.4.1" data-path="trees.html"><a href="trees.html#managing-loss"><i class="fa fa-check"></i><b>6.4.1</b> Managing loss</a></li>
<li class="chapter" data-level="6.4.2" data-path="trees.html"><a href="trees.html#penalization"><i class="fa fa-check"></i><b>6.4.2</b> Penalization</a></li>
<li class="chapter" data-level="6.4.3" data-path="trees.html"><a href="trees.html#aggregation"><i class="fa fa-check"></i><b>6.4.3</b> Aggregation</a></li>
<li class="chapter" data-level="6.4.4" data-path="trees.html"><a href="trees.html#tree-structure"><i class="fa fa-check"></i><b>6.4.4</b> Tree structure</a></li>
<li class="chapter" data-level="6.4.5" data-path="trees.html"><a href="trees.html#boostext"><i class="fa fa-check"></i><b>6.4.5</b> Extensions</a></li>
<li class="chapter" data-level="6.4.6" data-path="trees.html"><a href="trees.html#boostcode"><i class="fa fa-check"></i><b>6.4.6</b> Code and results</a></li>
<li class="chapter" data-level="6.4.7" data-path="trees.html"><a href="trees.html#instweight"><i class="fa fa-check"></i><b>6.4.7</b> Instance weighting</a></li>
</ul></li>
<li class="chapter" data-level="6.5" data-path="trees.html"><a href="trees.html#discussion"><i class="fa fa-check"></i><b>6.5</b> Discussion</a></li>
<li class="chapter" data-level="6.6" data-path="trees.html"><a href="trees.html#coding-exercises-2"><i class="fa fa-check"></i><b>6.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="NN.html"><a href="NN.html"><i class="fa fa-check"></i><b>7</b> Neural networks</a><ul>
<li class="chapter" data-level="7.1" data-path="NN.html"><a href="NN.html#the-original-perceptron"><i class="fa fa-check"></i><b>7.1</b> The original perceptron</a></li>
<li class="chapter" data-level="7.2" data-path="NN.html"><a href="NN.html#multilayer-perceptron"><i class="fa fa-check"></i><b>7.2</b> Multilayer perceptron</a><ul>
<li class="chapter" data-level="7.2.1" data-path="NN.html"><a href="NN.html#introduction-and-notations"><i class="fa fa-check"></i><b>7.2.1</b> Introduction and notations</a></li>
<li class="chapter" data-level="7.2.2" data-path="NN.html"><a href="NN.html#universal-approximation"><i class="fa fa-check"></i><b>7.2.2</b> Universal approximation</a></li>
<li class="chapter" data-level="7.2.3" data-path="NN.html"><a href="NN.html#backprop"><i class="fa fa-check"></i><b>7.2.3</b> Learning via back-propagation</a></li>
<li class="chapter" data-level="7.2.4" data-path="NN.html"><a href="NN.html#further-details-on-classification"><i class="fa fa-check"></i><b>7.2.4</b> Further details on classification</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="NN.html"><a href="NN.html#howdeep"><i class="fa fa-check"></i><b>7.3</b> How deep we should go and other practical issues</a><ul>
<li class="chapter" data-level="7.3.1" data-path="NN.html"><a href="NN.html#architectural-choices"><i class="fa fa-check"></i><b>7.3.1</b> Architectural choices</a></li>
<li class="chapter" data-level="7.3.2" data-path="NN.html"><a href="NN.html#frequency-of-weight-updates-and-learning-duration"><i class="fa fa-check"></i><b>7.3.2</b> Frequency of weight updates and learning duration</a></li>
<li class="chapter" data-level="7.3.3" data-path="NN.html"><a href="NN.html#penalizations-and-dropout"><i class="fa fa-check"></i><b>7.3.3</b> Penalizations and dropout</a></li>
</ul></li>
<li class="chapter" data-level="7.4" data-path="NN.html"><a href="NN.html#code-samples-and-comments-for-vanilla-mlp"><i class="fa fa-check"></i><b>7.4</b> Code samples and comments for vanilla MLP</a><ul>
<li class="chapter" data-level="7.4.1" data-path="NN.html"><a href="NN.html#regression-example"><i class="fa fa-check"></i><b>7.4.1</b> Regression example</a></li>
<li class="chapter" data-level="7.4.2" data-path="NN.html"><a href="NN.html#classification-example"><i class="fa fa-check"></i><b>7.4.2</b> Classification example</a></li>
<li class="chapter" data-level="7.4.3" data-path="NN.html"><a href="NN.html#custloss"><i class="fa fa-check"></i><b>7.4.3</b> Custom losses</a></li>
</ul></li>
<li class="chapter" data-level="7.5" data-path="NN.html"><a href="NN.html#recurrent-networks"><i class="fa fa-check"></i><b>7.5</b> Recurrent networks</a><ul>
<li class="chapter" data-level="7.5.1" data-path="NN.html"><a href="NN.html#presentation"><i class="fa fa-check"></i><b>7.5.1</b> Presentation</a></li>
<li class="chapter" data-level="7.5.2" data-path="NN.html"><a href="NN.html#code-and-results-2"><i class="fa fa-check"></i><b>7.5.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="7.6" data-path="NN.html"><a href="NN.html#other-common-architectures"><i class="fa fa-check"></i><b>7.6</b> Other common architectures</a><ul>
<li class="chapter" data-level="7.6.1" data-path="NN.html"><a href="NN.html#generative-aversarial-networks"><i class="fa fa-check"></i><b>7.6.1</b> Generative adversarial networks</a></li>
<li class="chapter" data-level="7.6.2" data-path="NN.html"><a href="NN.html#autoencoders"><i class="fa fa-check"></i><b>7.6.2</b> Autoencoders</a></li>
<li class="chapter" data-level="7.6.3" data-path="NN.html"><a href="NN.html#a-word-on-convolutional-networks"><i class="fa fa-check"></i><b>7.6.3</b> A word on convolutional networks</a></li>
<li class="chapter" data-level="7.6.4" data-path="NN.html"><a href="NN.html#advanced-architectures"><i class="fa fa-check"></i><b>7.6.4</b> Advanced architectures</a></li>
</ul></li>
<li class="chapter" data-level="7.7" data-path="NN.html"><a href="NN.html#coding-exercise-1"><i class="fa fa-check"></i><b>7.7</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>8</b> Support vector machines</a><ul>
<li class="chapter" data-level="8.1" data-path="svm.html"><a href="svm.html#svm-for-classification"><i class="fa fa-check"></i><b>8.1</b> SVM for classification</a></li>
<li class="chapter" data-level="8.2" data-path="svm.html"><a href="svm.html#svm-for-regression"><i class="fa fa-check"></i><b>8.2</b> SVM for regression</a></li>
<li class="chapter" data-level="8.3" data-path="svm.html"><a href="svm.html#practice"><i class="fa fa-check"></i><b>8.3</b> Practice</a></li>
<li class="chapter" data-level="8.4" data-path="svm.html"><a href="svm.html#coding-exercises-3"><i class="fa fa-check"></i><b>8.4</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="bayes.html"><a href="bayes.html"><i class="fa fa-check"></i><b>9</b> Bayesian methods</a><ul>
<li class="chapter" data-level="9.1" data-path="bayes.html"><a href="bayes.html#the-bayesian-framework"><i class="fa fa-check"></i><b>9.1</b> The Bayesian framework</a></li>
<li class="chapter" data-level="9.2" data-path="bayes.html"><a href="bayes.html#bayesian-sampling"><i class="fa fa-check"></i><b>9.2</b> Bayesian sampling</a><ul>
<li class="chapter" data-level="9.2.1" data-path="bayes.html"><a href="bayes.html#gibbs-sampling"><i class="fa fa-check"></i><b>9.2.1</b> Gibbs sampling</a></li>
<li class="chapter" data-level="9.2.2" data-path="bayes.html"><a href="bayes.html#metropolis-hastings-sampling"><i class="fa fa-check"></i><b>9.2.2</b> Metropolis-Hastings sampling</a></li>
</ul></li>
<li class="chapter" data-level="9.3" data-path="bayes.html"><a href="bayes.html#bayesian-linear-regression"><i class="fa fa-check"></i><b>9.3</b> Bayesian linear regression</a></li>
<li class="chapter" data-level="9.4" data-path="bayes.html"><a href="bayes.html#naive-bayes-classifier"><i class="fa fa-check"></i><b>9.4</b> Naive Bayes classifier</a></li>
<li class="chapter" data-level="9.5" data-path="bayes.html"><a href="bayes.html#BART"><i class="fa fa-check"></i><b>9.5</b> Bayesian additive trees</a><ul>
<li class="chapter" data-level="9.5.1" data-path="bayes.html"><a href="bayes.html#general-formulation"><i class="fa fa-check"></i><b>9.5.1</b> General formulation</a></li>
<li class="chapter" data-level="9.5.2" data-path="bayes.html"><a href="bayes.html#priors"><i class="fa fa-check"></i><b>9.5.2</b> Priors</a></li>
<li class="chapter" data-level="9.5.3" data-path="bayes.html"><a href="bayes.html#sampling-and-predictions"><i class="fa fa-check"></i><b>9.5.3</b> Sampling and predictions</a></li>
<li class="chapter" data-level="9.5.4" data-path="bayes.html"><a href="bayes.html#code"><i class="fa fa-check"></i><b>9.5.4</b> Code</a></li>
</ul></li>
</ul></li>
<li class="part"><span><b>III From predictions to portfolios</b></span></li>
<li class="chapter" data-level="10" data-path="valtune.html"><a href="valtune.html"><i class="fa fa-check"></i><b>10</b> Validating and tuning</a><ul>
<li class="chapter" data-level="10.1" data-path="valtune.html"><a href="valtune.html#mlmetrics"><i class="fa fa-check"></i><b>10.1</b> Learning metrics</a><ul>
<li class="chapter" data-level="10.1.1" data-path="valtune.html"><a href="valtune.html#regression-analysis"><i class="fa fa-check"></i><b>10.1.1</b> Regression analysis</a></li>
<li class="chapter" data-level="10.1.2" data-path="valtune.html"><a href="valtune.html#classification-analysis"><i class="fa fa-check"></i><b>10.1.2</b> Classification analysis</a></li>
</ul></li>
<li class="chapter" data-level="10.2" data-path="valtune.html"><a href="valtune.html#validation"><i class="fa fa-check"></i><b>10.2</b> Validation</a><ul>
<li class="chapter" data-level="10.2.1" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-theory"><i class="fa fa-check"></i><b>10.2.1</b> The variance-bias tradeoff: theory</a></li>
<li class="chapter" data-level="10.2.2" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-illustration"><i class="fa fa-check"></i><b>10.2.2</b> The variance-bias tradeoff: illustration</a></li>
<li class="chapter" data-level="10.2.3" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-principle"><i class="fa fa-check"></i><b>10.2.3</b> The risk of overfitting: principle</a></li>
<li class="chapter" data-level="10.2.4" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-some-solutions"><i class="fa fa-check"></i><b>10.2.4</b> The risk of overfitting: some solutions</a></li>
</ul></li>
<li class="chapter" data-level="10.3" data-path="valtune.html"><a href="valtune.html#the-search-for-good-hyperparameters"><i class="fa fa-check"></i><b>10.3</b> The search for good hyperparameters</a><ul>
<li class="chapter" data-level="10.3.1" data-path="valtune.html"><a href="valtune.html#methods"><i class="fa fa-check"></i><b>10.3.1</b> Methods</a></li>
<li class="chapter" data-level="10.3.2" data-path="valtune.html"><a href="valtune.html#example-grid-search"><i class="fa fa-check"></i><b>10.3.2</b> Example: grid search</a></li>
<li class="chapter" data-level="10.3.3" data-path="valtune.html"><a href="valtune.html#example-bayesian-optimization"><i class="fa fa-check"></i><b>10.3.3</b> Example: Bayesian optimization</a></li>
</ul></li>
<li class="chapter" data-level="10.4" data-path="valtune.html"><a href="valtune.html#short-discussion-on-validation-in-backtests"><i class="fa fa-check"></i><b>10.4</b> Short discussion on validation in backtests</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="ensemble.html"><a href="ensemble.html"><i class="fa fa-check"></i><b>11</b> Ensemble models</a><ul>
<li class="chapter" data-level="11.1" data-path="ensemble.html"><a href="ensemble.html#linear-ensembles"><i class="fa fa-check"></i><b>11.1</b> Linear ensembles</a><ul>
<li class="chapter" data-level="11.1.1" data-path="ensemble.html"><a href="ensemble.html#principles"><i class="fa fa-check"></i><b>11.1.1</b> Principles</a></li>
<li class="chapter" data-level="11.1.2" data-path="ensemble.html"><a href="ensemble.html#example"><i class="fa fa-check"></i><b>11.1.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="11.2" data-path="ensemble.html"><a href="ensemble.html#stacked-ensembles"><i class="fa fa-check"></i><b>11.2</b> Stacked ensembles</a><ul>
<li class="chapter" data-level="11.2.1" data-path="ensemble.html"><a href="ensemble.html#two-stage-training"><i class="fa fa-check"></i><b>11.2.1</b> Two-stage training</a></li>
<li class="chapter" data-level="11.2.2" data-path="ensemble.html"><a href="ensemble.html#code-and-results-3"><i class="fa fa-check"></i><b>11.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="11.3" data-path="ensemble.html"><a href="ensemble.html#extensions-1"><i class="fa fa-check"></i><b>11.3</b> Extensions</a><ul>
<li class="chapter" data-level="11.3.1" data-path="ensemble.html"><a href="ensemble.html#exogenous-variables"><i class="fa fa-check"></i><b>11.3.1</b> Exogenous variables</a></li>
<li class="chapter" data-level="11.3.2" data-path="ensemble.html"><a href="ensemble.html#shrinking-inter-model-correlations"><i class="fa fa-check"></i><b>11.3.2</b> Shrinking inter-model correlations</a></li>
</ul></li>
<li class="chapter" data-level="11.4" data-path="ensemble.html"><a href="ensemble.html#exercise"><i class="fa fa-check"></i><b>11.4</b> Exercise</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="backtest.html"><a href="backtest.html"><i class="fa fa-check"></i><b>12</b> Portfolio backtesting</a><ul>
<li class="chapter" data-level="12.1" data-path="backtest.html"><a href="backtest.html#protocol"><i class="fa fa-check"></i><b>12.1</b> Setting the protocol</a></li>
<li class="chapter" data-level="12.2" data-path="backtest.html"><a href="backtest.html#turning-signals-into-portfolio-weights"><i class="fa fa-check"></i><b>12.2</b> Turning signals into portfolio weights</a></li>
<li class="chapter" data-level="12.3" data-path="backtest.html"><a href="backtest.html#perfmet"><i class="fa fa-check"></i><b>12.3</b> Performance metrics</a><ul>
<li class="chapter" data-level="12.3.1" data-path="backtest.html"><a href="backtest.html#discussion-1"><i class="fa fa-check"></i><b>12.3.1</b> Discussion</a></li>
<li class="chapter" data-level="12.3.2" data-path="backtest.html"><a href="backtest.html#pure-performance-and-risk-indicators"><i class="fa fa-check"></i><b>12.3.2</b> Pure performance and risk indicators</a></li>
<li class="chapter" data-level="12.3.3" data-path="backtest.html"><a href="backtest.html#factor-based-evaluation"><i class="fa fa-check"></i><b>12.3.3</b> Factor-based evaluation</a></li>
<li class="chapter" data-level="12.3.4" data-path="backtest.html"><a href="backtest.html#risk-adjusted-measures"><i class="fa fa-check"></i><b>12.3.4</b> Risk-adjusted measures</a></li>
<li class="chapter" data-level="12.3.5" data-path="backtest.html"><a href="backtest.html#transaction-costs-and-turnover"><i class="fa fa-check"></i><b>12.3.5</b> Transaction costs and turnover</a></li>
</ul></li>
<li class="chapter" data-level="12.4" data-path="backtest.html"><a href="backtest.html#common-errors-and-issues"><i class="fa fa-check"></i><b>12.4</b> Common errors and issues</a><ul>
<li class="chapter" data-level="12.4.1" data-path="backtest.html"><a href="backtest.html#forward-looking-data"><i class="fa fa-check"></i><b>12.4.1</b> Forward looking data</a></li>
<li class="chapter" data-level="12.4.2" data-path="backtest.html"><a href="backtest.html#backov"><i class="fa fa-check"></i><b>12.4.2</b> Backtest overfitting</a></li>
<li class="chapter" data-level="12.4.3" data-path="backtest.html"><a href="backtest.html#simple-safeguards"><i class="fa fa-check"></i><b>12.4.3</b> Simple safeguards</a></li>
</ul></li>
<li class="chapter" data-level="12.5" data-path="backtest.html"><a href="backtest.html#implication-of-non-stationarity-forecasting-is-hard"><i class="fa fa-check"></i><b>12.5</b> Implication of non-stationarity: forecasting is hard</a><ul>
<li class="chapter" data-level="12.5.1" data-path="backtest.html"><a href="backtest.html#general-comments"><i class="fa fa-check"></i><b>12.5.1</b> General comments</a></li>
<li class="chapter" data-level="12.5.2" data-path="backtest.html"><a href="backtest.html#the-no-free-lunch-theorem"><i class="fa fa-check"></i><b>12.5.2</b> The no free lunch theorem</a></li>
</ul></li>
<li class="chapter" data-level="12.6" data-path="backtest.html"><a href="backtest.html#first-example-a-complete-backtest"><i class="fa fa-check"></i><b>12.6</b> First example: a complete backtest</a></li>
<li class="chapter" data-level="12.7" data-path="backtest.html"><a href="backtest.html#second-example-backtest-overfitting"><i class="fa fa-check"></i><b>12.7</b> Second example: backtest overfitting</a></li>
<li class="chapter" data-level="12.8" data-path="backtest.html"><a href="backtest.html#coding-exercises-4"><i class="fa fa-check"></i><b>12.8</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>IV Further important topics</b></span></li>
<li class="chapter" data-level="13" data-path="interp.html"><a href="interp.html"><i class="fa fa-check"></i><b>13</b> Interpretability</a><ul>
<li class="chapter" data-level="13.1" data-path="interp.html"><a href="interp.html#global-interpretations"><i class="fa fa-check"></i><b>13.1</b> Global interpretations</a><ul>
<li class="chapter" data-level="13.1.1" data-path="interp.html"><a href="interp.html#surr"><i class="fa fa-check"></i><b>13.1.1</b> Simple models as surrogates</a></li>
<li class="chapter" data-level="13.1.2" data-path="interp.html"><a href="interp.html#variable-importance"><i class="fa fa-check"></i><b>13.1.2</b> Variable importance (tree-based)</a></li>
<li class="chapter" data-level="13.1.3" data-path="interp.html"><a href="interp.html#variable-importance-agnostic"><i class="fa fa-check"></i><b>13.1.3</b> Variable importance (agnostic)</a></li>
<li class="chapter" data-level="13.1.4" data-path="interp.html"><a href="interp.html#partial-dependence-plot"><i class="fa fa-check"></i><b>13.1.4</b> Partial dependence plot</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="interp.html"><a href="interp.html#local-interpretations"><i class="fa fa-check"></i><b>13.2</b> Local interpretations</a><ul>
<li class="chapter" data-level="13.2.1" data-path="interp.html"><a href="interp.html#lime"><i class="fa fa-check"></i><b>13.2.1</b> LIME</a></li>
<li class="chapter" data-level="13.2.2" data-path="interp.html"><a href="interp.html#shapley-values"><i class="fa fa-check"></i><b>13.2.2</b> Shapley values</a></li>
<li class="chapter" data-level="13.2.3" data-path="interp.html"><a href="interp.html#breakdown"><i class="fa fa-check"></i><b>13.2.3</b> Breakdown</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="14" data-path="causality.html"><a href="causality.html"><i class="fa fa-check"></i><b>14</b> Two key concepts: causality and non-stationarity</a><ul>
<li class="chapter" data-level="14.1" data-path="causality.html"><a href="causality.html#causality-1"><i class="fa fa-check"></i><b>14.1</b> Causality</a><ul>
<li class="chapter" data-level="14.1.1" data-path="causality.html"><a href="causality.html#granger"><i class="fa fa-check"></i><b>14.1.1</b> Granger causality</a></li>
<li class="chapter" data-level="14.1.2" data-path="causality.html"><a href="causality.html#causal-additive-models"><i class="fa fa-check"></i><b>14.1.2</b> Causal additive models</a></li>
<li class="chapter" data-level="14.1.3" data-path="causality.html"><a href="causality.html#structural-time-series-models"><i class="fa fa-check"></i><b>14.1.3</b> Structural time series models</a></li>
</ul></li>
<li class="chapter" data-level="14.2" data-path="causality.html"><a href="causality.html#nonstat"><i class="fa fa-check"></i><b>14.2</b> Dealing with changing environments</a><ul>
<li class="chapter" data-level="14.2.1" data-path="causality.html"><a href="causality.html#non-stationarity-yet-another-illustration"><i class="fa fa-check"></i><b>14.2.1</b> Non-stationarity: yet another illustration</a></li>
<li class="chapter" data-level="14.2.2" data-path="causality.html"><a href="causality.html#online-learning"><i class="fa fa-check"></i><b>14.2.2</b> Online learning</a></li>
<li class="chapter" data-level="14.2.3" data-path="causality.html"><a href="causality.html#homogeneous-transfer-learning"><i class="fa fa-check"></i><b>14.2.3</b> Homogeneous transfer learning</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="15" data-path="unsup.html"><a href="unsup.html"><i class="fa fa-check"></i><b>15</b> Unsupervised learning</a><ul>
<li class="chapter" data-level="15.1" data-path="unsup.html"><a href="unsup.html#corpred"><i class="fa fa-check"></i><b>15.1</b> The problem with correlated predictors</a></li>
<li class="chapter" data-level="15.2" data-path="unsup.html"><a href="unsup.html#principal-component-analysis-and-autoencoders"><i class="fa fa-check"></i><b>15.2</b> Principal component analysis and autoencoders</a><ul>
<li class="chapter" data-level="15.2.1" data-path="unsup.html"><a href="unsup.html#a-bit-of-algebra"><i class="fa fa-check"></i><b>15.2.1</b> A bit of algebra</a></li>
<li class="chapter" data-level="15.2.2" data-path="unsup.html"><a href="unsup.html#pca"><i class="fa fa-check"></i><b>15.2.2</b> PCA</a></li>
<li class="chapter" data-level="15.2.3" data-path="unsup.html"><a href="unsup.html#ae"><i class="fa fa-check"></i><b>15.2.3</b> Autoencoders</a></li>
<li class="chapter" data-level="15.2.4" data-path="unsup.html"><a href="unsup.html#application"><i class="fa fa-check"></i><b>15.2.4</b> Application</a></li>
</ul></li>
<li class="chapter" data-level="15.3" data-path="unsup.html"><a href="unsup.html#clustering-via-k-means"><i class="fa fa-check"></i><b>15.3</b> Clustering via k-means</a></li>
<li class="chapter" data-level="15.4" data-path="unsup.html"><a href="unsup.html#nearest-neighbors"><i class="fa fa-check"></i><b>15.4</b> Nearest neighbors</a></li>
<li class="chapter" data-level="15.5" data-path="unsup.html"><a href="unsup.html#coding-exercise-2"><i class="fa fa-check"></i><b>15.5</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="16" data-path="RL.html"><a href="RL.html"><i class="fa fa-check"></i><b>16</b> Reinforcement learning</a><ul>
<li class="chapter" data-level="16.1" data-path="RL.html"><a href="RL.html#theoretical-layout"><i class="fa fa-check"></i><b>16.1</b> Theoretical layout</a><ul>
<li class="chapter" data-level="16.1.1" data-path="RL.html"><a href="RL.html#general-framework"><i class="fa fa-check"></i><b>16.1.1</b> General framework</a></li>
<li class="chapter" data-level="16.1.2" data-path="RL.html"><a href="RL.html#q-learning"><i class="fa fa-check"></i><b>16.1.2</b> Q-learning</a></li>
<li class="chapter" data-level="16.1.3" data-path="RL.html"><a href="RL.html#sarsa"><i class="fa fa-check"></i><b>16.1.3</b> SARSA</a></li>
</ul></li>
<li class="chapter" data-level="16.2" data-path="RL.html"><a href="RL.html#the-curse-of-dimensionality"><i class="fa fa-check"></i><b>16.2</b> The curse of dimensionality</a></li>
<li class="chapter" data-level="16.3" data-path="RL.html"><a href="RL.html#policy-gradient"><i class="fa fa-check"></i><b>16.3</b> Policy gradient</a><ul>
<li class="chapter" data-level="16.3.1" data-path="RL.html"><a href="RL.html#principle-2"><i class="fa fa-check"></i><b>16.3.1</b> Principle</a></li>
<li class="chapter" data-level="16.3.2" data-path="RL.html"><a href="RL.html#extensions-2"><i class="fa fa-check"></i><b>16.3.2</b> Extensions</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="RL.html"><a href="RL.html#simple-examples"><i class="fa fa-check"></i><b>16.4</b> Simple examples</a><ul>
<li class="chapter" data-level="16.4.1" data-path="RL.html"><a href="RL.html#q-learning-with-simulations"><i class="fa fa-check"></i><b>16.4.1</b> Q-learning with simulations</a></li>
<li class="chapter" data-level="16.4.2" data-path="RL.html"><a href="RL.html#RLemp2"><i class="fa fa-check"></i><b>16.4.2</b> Q-learning with market data</a></li>
</ul></li>
<li class="chapter" data-level="16.5" data-path="RL.html"><a href="RL.html#concluding-remarks"><i class="fa fa-check"></i><b>16.5</b> Concluding remarks</a></li>
<li class="chapter" data-level="16.6" data-path="RL.html"><a href="RL.html#exercises"><i class="fa fa-check"></i><b>16.6</b> Exercises</a></li>
</ul></li>
<li class="part"><span><b>V Appendix</b></span></li>
<li class="chapter" data-level="17" data-path="data-description.html"><a href="data-description.html"><i class="fa fa-check"></i><b>17</b> Data description</a></li>
<li class="chapter" data-level="18" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html"><i class="fa fa-check"></i><b>18</b> Solutions to exercises</a><ul>
<li class="chapter" data-level="18.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-3"><i class="fa fa-check"></i><b>18.1</b> Chapter 3</a></li>
<li class="chapter" data-level="18.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-4"><i class="fa fa-check"></i><b>18.2</b> Chapter 4</a></li>
<li class="chapter" data-level="18.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-5"><i class="fa fa-check"></i><b>18.3</b> Chapter 5</a></li>
<li class="chapter" data-level="18.4" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-6"><i class="fa fa-check"></i><b>18.4</b> Chapter 6</a></li>
<li class="chapter" data-level="18.5" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-7-the-autoencoder-model"><i class="fa fa-check"></i><b>18.5</b> Chapter 7: the autoencoder model</a></li>
<li class="chapter" data-level="18.6" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-8"><i class="fa fa-check"></i><b>18.6</b> Chapter 8</a></li>
<li class="chapter" data-level="18.7" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-11-ensemble-neural-network"><i class="fa fa-check"></i><b>18.7</b> Chapter 11: ensemble neural network</a></li>
<li class="chapter" data-level="18.8" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-12"><i class="fa fa-check"></i><b>18.8</b> Chapter 12</a><ul>
<li class="chapter" data-level="18.8.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#ew-portfolios-with-the-tidyverse"><i class="fa fa-check"></i><b>18.8.1</b> EW portfolios with the tidyverse</a></li>
<li class="chapter" data-level="18.8.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#advanced-weighting-function"><i class="fa fa-check"></i><b>18.8.2</b> Advanced weighting function</a></li>
<li class="chapter" data-level="18.8.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#functional-programming-in-the-backtest"><i class="fa fa-check"></i><b>18.8.3</b> Functional programming in the backtest</a></li>
</ul></li>
<li class="chapter" data-level="18.9" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-15"><i class="fa fa-check"></i><b>18.9</b> Chapter 15</a></li>
<li class="chapter" data-level="18.10" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-16"><i class="fa fa-check"></i><b>18.10</b> Chapter 16</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning for Factor Investing</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="unsup" class="section level1">
<h1><span class="header-section-number">Chapter 15</span> Unsupervised learning</h1>
<p>All algorithms presented in Chapters <a href="lasso.html#lasso">5</a> to <a href="bayes.html#bayes">9</a> belong to the larger class of supervised learning tools. Such tools seek to unveil a mapping between predictors <span class="math inline">\(\textbf{X}\)</span> and a label <span class="math inline">\(\textbf{Z}\)</span>. The supervision comes from the fact that the algorithm is explicitly asked to explain this particular variable <span class="math inline">\(\textbf{Z}\)</span> with the data. Another important part of machine learning consists of unsupervised tasks, that is, when <span class="math inline">\(\textbf{Z}\)</span> is not specified and the algorithm tries to make sense of <span class="math inline">\(\textbf{X}\)</span> on its own. Often, relationships between the components of <span class="math inline">\(\textbf{X}\)</span> are identified. This field is much too vast to be summarized in one book, let alone one chapter. The purpose here is to briefly explain in what ways unsupervised learning can be used, especially in the data pre-processing phase.</p>
<div id="corpred" class="section level2">
<h2><span class="header-section-number">15.1</span> The problem with correlated predictors</h2>
<p>Often, it is tempting to supply all predictors to an ML-fueled predictive engine. That may not be a good idea when some predictors are highly correlated. To illustrate this, the simplest example is a regression on two variables with zero mean and the following covariance and precision matrices:
<span class="math display">\[\boldsymbol{\Sigma}=\textbf{X}'\textbf{X}=\begin{bmatrix} 1 & \rho \\ \rho & 1 \end{bmatrix}, \quad \boldsymbol{\Sigma}^{-1}=\frac{1}{1-\rho^2}\begin{bmatrix} 1 & -\rho \\ -\rho & 1 \end{bmatrix}.\]</span>
When the covariance/correlation <span class="math inline">\(\rho\)</span> increases towards 1 (the two variables become collinear), the scaling denominator <span class="math inline">\(1-\rho^2\)</span> in <span class="math inline">\(\boldsymbol{\Sigma}^{-1}\)</span> goes to zero and the formula <span class="math inline">\(\hat{\boldsymbol{\beta}}=\boldsymbol{\Sigma}^{-1}\textbf{X}'\textbf{Z}\)</span> implies that one coefficient will be highly positive and the other highly negative. The regression creates a spurious arbitrage between the two variables. Of course, this is very inefficient and yields disastrous results out-of-sample.</p>
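<p>To make this concrete, below is a minimal simulation (a sketch, not part of the book's code) in which two almost perfectly correlated predictors are fed to a linear regression; the variable names are purely illustrative. The estimated coefficients are typically inflated and of opposite signs, even though only their common signal matters for the label.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Illustrative sketch: spurious offsetting coefficients under near-collinearity
set.seed(42)                          # For reproducibility
n  &lt;- 500
x1 &lt;- rnorm(n)                        # First predictor
x2 &lt;- x1 + rnorm(n, sd = 0.05)        # Second predictor, correlation with x1 close to 1
z  &lt;- 0.5 * x1 + rnorm(n)             # Label driven by the common signal only
cor(x1, x2)                           # Check the (very high) correlation
coef(lm(z ~ x1 + x2))                 # Estimates are unstable: often large and of opposite signs</code></pre></div>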
<p>We illustrate what happens when many variables are used in the regression below (Table <a href="unsup.html#tab:regbroom">15.1</a>). One clear illustration of this phenomenon comes from the variables Mkt_Cap_12M_Usd and Mkt_Cap_6M_Usd, which have a correlation of 99.6% in the training sample. Both are singled out as highly significant, but their signs are contradictory. Moreover, the magnitudes of their coefficients are very close (0.20 versus 0.18), so that their net effect approximately cancels out. Naturally, providing the regression with only one of these two inputs would have been wiser.</p>
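<p>The 99.6% figure can be verified directly (a one-line sketch, assuming the same training_sample data frame used in the regression code below):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">cor(training_sample$Mkt_Cap_12M_Usd,   # Correlation between the two market capitalization features
    training_sample$Mkt_Cap_6M_Usd)    # Should be close to 0.996 in the training sample</code></pre></div>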
<div class="sourceCode" id="cb224"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb224-1"><a href="unsup.html#cb224-1"></a><span class="kw">library</span>(broom) <span class="co"># Package for clean regression output </span></span>
<span id="cb224-2"><a href="unsup.html#cb224-2"></a>training_sample <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb224-3"><a href="unsup.html#cb224-3"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(<span class="kw">c</span>(features, <span class="st">"R1M_Usd"</span>)) <span class="op">%>%</span><span class="st"> </span><span class="co"># List of variables</span></span>
<span id="cb224-4"><a href="unsup.html#cb224-4"></a><span class="st"> </span><span class="kw">lm</span>(R1M_Usd <span class="op">~</span><span class="st"> </span>. , <span class="dt">data =</span> .) <span class="op">%>%</span><span class="st"> </span><span class="co"># Model: predict R1M_Usd</span></span>
<span id="cb224-5"><a href="unsup.html#cb224-5"></a><span class="st"> </span><span class="kw">tidy</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Put output in clean format</span></span>
<span id="cb224-6"><a href="unsup.html#cb224-6"></a><span class="st"> </span><span class="kw">filter</span>(<span class="kw">abs</span>(statistic) <span class="op">></span><span class="st"> </span><span class="dv">3</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># Keep significant predictors only</span></span>
<span id="cb224-7"><a href="unsup.html#cb224-7"></a><span class="st"> </span>knitr<span class="op">::</span><span class="kw">kable</span>(<span class="dt">booktabs =</span> <span class="ot">TRUE</span>,</span>
<span id="cb224-8"><a href="unsup.html#cb224-8"></a> <span class="dt">caption =</span> <span class="st">"Significant predictors in the training sample."</span>) </span></code></pre></div>
<table>
<caption>
<span id="tab:regbroom">TABLE 15.1: </span>Significant predictors in the training sample.
</caption>
<thead>
<tr>
<th style="text-align:left;">
term
</th>
<th style="text-align:right;">
estimate
</th>
<th style="text-align:right;">
std.error
</th>
<th style="text-align:right;">
statistic
</th>
<th style="text-align:right;">
p.value
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
(Intercept)
</td>
<td style="text-align:right;">
0.0405741
</td>
<td style="text-align:right;">
0.0053427
</td>
<td style="text-align:right;">
7.594323
</td>
<td style="text-align:right;">
0.0000000
</td>
</tr>
<tr>
<td style="text-align:left;">
Ebitda_Margin
</td>
<td style="text-align:right;">
0.0132374
</td>
<td style="text-align:right;">
0.0034927
</td>
<td style="text-align:right;">
3.789999
</td>
<td style="text-align:right;">
0.0001507
</td>
</tr>
<tr>
<td style="text-align:left;">
Ev_Ebitda
</td>
<td style="text-align:right;">
0.0068144
</td>
<td style="text-align:right;">
0.0022563
</td>
<td style="text-align:right;">
3.020213
</td>
<td style="text-align:right;">
0.0025263
</td>
</tr>
<tr>
<td style="text-align:left;">
Fa_Ci
</td>
<td style="text-align:right;">
0.0072308
</td>
<td style="text-align:right;">
0.0023465
</td>
<td style="text-align:right;">
3.081471
</td>
<td style="text-align:right;">
0.0020601
</td>
</tr>
<tr>
<td style="text-align:left;">
Fcf_Bv
</td>
<td style="text-align:right;">
0.0250538
</td>
<td style="text-align:right;">
0.0051314
</td>
<td style="text-align:right;">
4.882465
</td>
<td style="text-align:right;">
0.0000010
</td>
</tr>
<tr>
<td style="text-align:left;">
Fcf_Yld
</td>
<td style="text-align:right;">
-0.0158930
</td>
<td style="text-align:right;">
0.0037359
</td>
<td style="text-align:right;">
-4.254126
</td>
<td style="text-align:right;">
0.0000210
</td>
</tr>
<tr>
<td style="text-align:left;">
Mkt_Cap_12M_Usd
</td>
<td style="text-align:right;">
0.2047383
</td>
<td style="text-align:right;">
0.0274320
</td>
<td style="text-align:right;">
7.463476
</td>
<td style="text-align:right;">
0.0000000
</td>
</tr>
<tr>
<td style="text-align:left;">
Mkt_Cap_6M_Usd
</td>
<td style="text-align:right;">
-0.1797795
</td>
<td style="text-align:right;">
0.0459390
</td>
<td style="text-align:right;">
-3.913443
</td>
<td style="text-align:right;">
0.0000910
</td>
</tr>
<tr>
<td style="text-align:left;">
Mom_5M_Usd
</td>
<td style="text-align:right;">
-0.0186690
</td>
<td style="text-align:right;">
0.0044313
</td>
<td style="text-align:right;">
-4.212972
</td>
<td style="text-align:right;">
0.0000252
</td>
</tr>
<tr>
<td style="text-align:left;">
Mom_Sharp_11M_Usd
</td>
<td style="text-align:right;">
0.0178174
</td>
<td style="text-align:right;">
0.0046948
</td>
<td style="text-align:right;">
3.795131
</td>
<td style="text-align:right;">
0.0001476
</td>
</tr>
<tr>
<td style="text-align:left;">
Ni
</td>
<td style="text-align:right;">
0.0154609
</td>
<td style="text-align:right;">
0.0044966
</td>
<td style="text-align:right;">
3.438361
</td>
<td style="text-align:right;">
0.0005854
</td>
</tr>
<tr>
<td style="text-align:left;">
Ni_Avail_Margin
</td>
<td style="text-align:right;">
0.0118135
</td>
<td style="text-align:right;">
0.0038614
</td>
<td style="text-align:right;">
3.059359
</td>
<td style="text-align:right;">
0.0022184
</td>
</tr>
<tr>
<td style="text-align:left;">
Ocf_Bv
</td>
<td style="text-align:right;">
-0.0198113
</td>
<td style="text-align:right;">
0.0052939
</td>
<td style="text-align:right;">
-3.742277
</td>
<td style="text-align:right;">
0.0001824
</td>
</tr>
<tr>
<td style="text-align:left;">
Pb
</td>
<td style="text-align:right;">
-0.0178971
</td>
<td style="text-align:right;">
0.0031285
</td>
<td style="text-align:right;">
-5.720637
</td>
<td style="text-align:right;">
0.0000000
</td>
</tr>
<tr>
<td style="text-align:left;">
Pe
</td>
<td style="text-align:right;">
-0.0089908
</td>
<td style="text-align:right;">
0.0023539
</td>
<td style="text-align:right;">
-3.819565
</td>
<td style="text-align:right;">
0.0001337
</td>
</tr>
<tr>
<td style="text-align:left;">
Sales_Ps
</td>
<td style="text-align:right;">
-0.0157856
</td>
<td style="text-align:right;">
0.0046278
</td>
<td style="text-align:right;">
-3.411062
</td>
<td style="text-align:right;">
0.0006472
</td>
</tr>
<tr>
<td style="text-align:left;">
Vol1Y_Usd
</td>
<td style="text-align:right;">
0.0114250
</td>
<td style="text-align:right;">
0.0027923
</td>
<td style="text-align:right;">
4.091628
</td>
<td style="text-align:right;">
0.0000429
</td>
</tr>
<tr>
<td style="text-align:left;">
Vol3Y_Usd
</td>
<td style="text-align:right;">
0.0084587
</td>
<td style="text-align:right;">
0.0027952
</td>
<td style="text-align:right;">
3.026169
</td>
<td style="text-align:right;">
0.0024771
</td>
</tr>
</tbody>
</table>
<p>In fact, there are several indicators of market capitalization and perhaps only one of them would suffice, but it is not obvious which one is the best choice.</p>
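<p>As a quick check, the pairwise correlations between the capitalization-based predictors can be computed directly. The snippet below is a minimal sketch; it assumes that all of these columns share the "Mkt_Cap" prefix in the training sample.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">training_sample %>%                               # Training data
    dplyr::select(dplyr::contains("Mkt_Cap")) %>% # Capitalization-based columns (assumed prefix)
    cor() %>%                                     # Pairwise correlation matrix
    round(3)                                      # Rounded for readability</code></pre></div>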
<p>To further illustrate these correlation issues, we compute the correlation matrix of the predictors (on the training sample) below. Because of its dimension, we show it graphically; since there are far too many labels to remain legible, we remove them.</p>
<div class="sourceCode" id="cb225"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb225-1"><a href="unsup.html#cb225-1"></a><span class="kw">library</span>(corrplot) <span class="co"># Package for plots of correlation matrices</span></span>
<span id="cb225-2"><a href="unsup.html#cb225-2"></a>C <-<span class="st"> </span><span class="kw">cor</span>(training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features)) <span class="co"># Correlation matrix</span></span>
<span id="cb225-3"><a href="unsup.html#cb225-3"></a><span class="kw">corrplot</span>(C, <span class="dt">tl.pos=</span><span class="st">'n'</span>) <span class="co"># Plot</span></span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:instcorrplot2"></span>
<img src="ML_factor_files/figure-html/instcorrplot2-1.png" alt="Correlation matrix of predictors." width="480" />
<p class="caption">
FIGURE 15.1: Correlation matrix of predictors.
</p>
</div>
<p>The graph of Figure <a href="unsup.html#fig:instcorrplot2">15.1</a> reveals several blue squares around the diagonal. For instance, the biggest square around the first third of features relates to all accounting ratios based on free cash flows. Because of this common term in their calculation, the features are naturally highly correlated. These local correlation patterns occur several times in the dataset and explain why it is not a good idea to use simple regression with this set of features.</p>
<p>In full disclosure, <strong>multicollinearity</strong> (when predictors are correlated) can be much less of a problem for ML tools than it is for pure statistical inference. In statistics, one central goal is to study the properties of the <span class="math inline">\(\beta\)</span> coefficients, and collinearity perturbs this kind of analysis. In machine learning, the aim is to maximize out-of-sample accuracy: if having many predictors helps, then so be it. One simple example can clarify this matter. When building a regression tree, having many predictors gives more options for the splits; if the features make sense, they can be useful. The same reasoning applies to random forests and boosted trees. What matters is that the large spectrum of features improves the generalization ability of the model; their collinearity is, for predictive purposes, largely irrelevant.</p>
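<p>One way to make this point tangible is to compare the fitted values of the full regression with those of a regression that drops one of the two almost collinear capitalization variables: the individual coefficients change drastically, but the predictions barely move. The sketch below is purely illustrative and assumes the training_sample and features objects used above.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">fit_full <- lm(R1M_Usd ~ .,                       # Regression with all features
               data = training_sample %>% dplyr::select(c(features, "R1M_Usd")))
fit_red  <- lm(R1M_Usd ~ .,                       # Same model without one of the two "twins"
               data = training_sample %>% 
                   dplyr::select(c(setdiff(features, "Mkt_Cap_6M_Usd"), "R1M_Usd")))
cor(fitted(fit_full), fitted(fit_red))            # Fitted values are expected to be almost identical</code></pre></div>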
<p>In the remainder of the chapter, we present two approaches that help reduce the number of predictors:</p>
<ul>
<li>the first one aims at creating new variables that are uncorrelated with each other. Low correlation is favorable from an algorithmic point of view, but the new variables lack interpretability;<br />
</li>
<li>the second one gathers predictors into homogeneous clusters and keeps only one feature from each cluster. Here the rationale is reversed: interpretability is favored over statistical properties, because the resulting set of features may still include high correlations, albeit to a lesser extent than the original one.</li>
</ul>
</div>
<div id="principal-component-analysis-and-autoencoders" class="section level2">
<h2><span class="header-section-number">15.2</span> Principal component analysis and autoencoders</h2>
<p>The first method is a cornerstone in dimensionality reduction. It seeks to determine a smaller number of factors (<span class="math inline">\(K'<K\)</span>) such that:<br />
- i) the level of explanatory power remains as high as possible;<br />
- ii) the resulting factors are linear combinations of the original variables;<br />
- iii) the resulting factors are orthogonal.</p>
<div id="a-bit-of-algebra" class="section level3">
<h3><span class="header-section-number">15.2.1</span> A bit of algebra</h3>
<p>
In this short subsection, we define some key concepts that are required to fully understand the derivation of principal component analysis (PCA). Henceforth, we work with matrices (in bold fonts). An <span class="math inline">\(I \times K\)</span> matrix <span class="math inline">\(\textbf{X}\)</span> is orthonormal if <span class="math inline">\(I> K\)</span> and <span class="math inline">\(\textbf{X}'\textbf{X}=\textbf{I}_K\)</span>. When <span class="math inline">\(I=K\)</span>, the (square) matrix is called orthogonal and <span class="math inline">\(\textbf{X}'\textbf{X}=\textbf{X}\textbf{X}'=\textbf{I}_K\)</span>, i.e., <span class="math inline">\(\textbf{X}^{-1}=\textbf{X}'\)</span>.</p>
<p>One foundational result in matrix theory is the Singular Value Decomposition (SVD, see, e.g., chapter 5 in <span class="citation">Meyer (<a href="#ref-meyer2000matrix" role="doc-biblioref">2000</a>)</span>). The SVD is formulated as follows: any <span class="math inline">\(I \times K\)</span> matrix <span class="math inline">\(\textbf{X}\)</span> can be decomposed into
<span class="math display" id="eq:svd">\[\begin{equation}
\tag{15.1}
\textbf{X}=\textbf{U} \boldsymbol{\Delta} \textbf{V}',
\end{equation}\]</span>
where <span class="math inline">\(\textbf{U}\)</span> (<span class="math inline">\(I\times I\)</span>) and <span class="math inline">\(\textbf{V}\)</span> (<span class="math inline">\(K \times K\)</span>) are orthogonal and <span class="math inline">\(\boldsymbol{\Delta}\)</span> (with dimensions <span class="math inline">\(I\times K\)</span>) is diagonal, i.e., <span class="math inline">\(\Delta_{i,k}=0\)</span> whenever <span class="math inline">\(i\neq k\)</span>. In addition, <span class="math inline">\(\Delta_{i,i}\ge 0\)</span>: the diagonal terms of <span class="math inline">\(\boldsymbol{\Delta}\)</span> are nonnegative.</p>
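<p>The decomposition is readily available in R through the svd() function. The short sketch below, on a small random matrix, simply verifies Equation <a href="unsup.html#eq:svd">(15.1)</a>; note that R returns the thin version of the SVD, in which <span class="math inline">\(\textbf{U}\)</span> has orthonormal columns rather than being a full square matrix.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">set.seed(42)                                        # Reproducibility
X <- matrix(rnorm(60), nrow = 10)                   # A small 10 x 6 matrix
dec <- svd(X)                                       # Singular value decomposition
max(abs(X - dec$u %*% diag(dec$d) %*% t(dec$v)))    # Reconstruction error: numerically zero
max(abs(t(dec$u) %*% dec$u - diag(6)))              # U'U = I_6 (orthonormal columns)</code></pre></div>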
<p>For simplicity, we assume below that <span class="math inline">\(\textbf{1}_I'\textbf{X}=\textbf{0}_K'\)</span>, i.e., that all columns have zero sum (and hence zero mean).<a href="#fn32" class="footnote-ref" id="fnref32"><sup>32</sup></a> This allows us to write the sample covariance matrix simply as <span class="math inline">\(\boldsymbol{\Sigma}_X= \frac{1}{I-1}\textbf{X}'\textbf{X}\)</span>.</p>
<p>One crucial feature of covariance matrices is their symmetry. Indeed, real-valued symmetric (square) matrices enjoy an SVD which is much more powerful: when <span class="math inline">\(\textbf{X}\)</span> is symmetric, there exist an orthogonal matrix <span class="math inline">\(\textbf{Q}\)</span> and a diagonal matrix <span class="math inline">\(\textbf{D}\)</span> such that
<span class="math display" id="eq:diagonaliz">\[\begin{equation}
\tag{15.2}
\textbf{X}=\textbf{Q}\textbf{DQ}'.
\end{equation}\]</span>
This process is called <strong>diagonalization</strong> (see chapter 7 in <span class="citation">Meyer (<a href="#ref-meyer2000matrix" role="doc-biblioref">2000</a>)</span>) and conveniently applies to covariance matrices.</p>
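<p>In R, this diagonalization is obtained with the eigen() function. The minimal sketch below, applied to a sample covariance matrix, checks that the eigenvectors form an orthogonal matrix and that <span class="math inline">\(\textbf{Q}\textbf{DQ}'\)</span> recovers the original matrix.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">set.seed(42)                                                   # Reproducibility
X <- matrix(rnorm(200), nrow = 50)                             # A 50 x 4 sample
S <- cov(X)                                                    # Symmetric covariance matrix
e <- eigen(S)                                                  # Q = e$vectors, D = diag(e$values)
max(abs(S - e$vectors %*% diag(e$values) %*% t(e$vectors)))    # Reconstruction error: numerically zero
max(abs(t(e$vectors) %*% e$vectors - diag(4)))                 # Q'Q = I_4 (orthogonality)</code></pre></div>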
</div>
<div id="pca" class="section level3">
<h3><span class="header-section-number">15.2.2</span> PCA</h3>
<p>
The goal of PCA is to build a dataset <span class="math inline">\(\tilde{\textbf{X}}\)</span> that has fewer columns but that keeps as much information as possible when compressing the original one, <span class="math inline">\(\textbf{X}\)</span>. The key notion is the <strong>change of base</strong>, which is a linear transformation of <span class="math inline">\(\textbf{X}\)</span> into <span class="math inline">\(\textbf{Z}\)</span>, a matrix with identical dimension, via
<span class="math display" id="eq:pca">\[\begin{equation}
\tag{15.3}
\textbf{Z}=\textbf{XP},
\end{equation}\]</span>
where <span class="math inline">\(\textbf{P}\)</span> is a <span class="math inline">\(K \times K\)</span> matrix. There are of course an infinite number of ways to transform <span class="math inline">\(\textbf{X}\)</span> into <span class="math inline">\(\textbf{Z}\)</span>, but two fundamental constraints help reduce the possibilities. The first constraint is that the columns of <span class="math inline">\(\textbf{Z}\)</span> be uncorrelated. Having uncorrelated features is desirable because they then all tell different stories and have zero redundancy. The second constraint is that the variance of the columns of <span class="math inline">\(\textbf{Z}\)</span> is highly concentrated. This means that a few factors (columns) will capture most of the explanatory power (signal), while the remaining ones will consist predominantly of noise. All of this is coded in the covariance matrix of <span class="math inline">\(\textbf{Z}\)</span>:</p>
<ul>
<li>the first condition imposes that the covariance matrix be diagonal;<br />
</li>
<li>the second condition imposes that the diagonal elements, when ranked in decreasing magnitude, see their value decline (sharply if possible).</li>
</ul>
<p>The covariance matrix of <span class="math inline">\(\textbf{Z}\)</span> is
<span class="math display" id="eq:covy">\[\begin{equation}
\tag{15.4}
\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{Z}'\textbf{Z}=\frac{1}{I-1}\textbf{P}'\textbf{X}'\textbf{XP}=\frac{1}{I-1}\textbf{P}'\boldsymbol{\Sigma}_X\textbf{P}.
\end{equation}\]</span></p>
<p>In this expression, we plug the decomposition <a href="unsup.html#eq:diagonaliz">(15.2)</a> of <span class="math inline">\(\boldsymbol{\Sigma}_X\)</span>:
<span class="math display">\[\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{P}'\textbf{Q}\textbf{DQ}'\textbf{P},\]</span>
thus, picking <span class="math inline">\(\textbf{P}=\textbf{Q}\)</span>, we get, by orthogonality, <span class="math inline">\(\boldsymbol{\Sigma}_Z=\frac{1}{I-1}\textbf{D}\)</span>, that is, a diagonal covariance matrix for <span class="math inline">\(\textbf{Z}\)</span>. The columns of <span class="math inline">\(\textbf{Z}\)</span> can then be re-shuffled in decreasing order of variance so that the diagonal elements of <span class="math inline">\(\boldsymbol{\Sigma}_Z\)</span> progressively shrink. This is useful because it helps locate the factors with the most informational content (the first factors). In the limit, a constant vector (with zero variance) carries no signal.</p>
<p>The matrix <span class="math inline">\(\textbf{Z}\)</span> is a linear transformation of <span class="math inline">\(\textbf{X}\)</span>; thus, it is expected to carry the same information, even though this information is coded differently. Since the columns are ordered according to their relative importance, it is simple to omit some of them. The new set of features <span class="math inline">\(\tilde{\textbf{X}}\)</span> consists of the first <span class="math inline">\(K'\)</span> (with <span class="math inline">\(K'<K\)</span>) columns of <span class="math inline">\(\textbf{Z}\)</span>.</p>
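<p>Before turning to the dedicated prcomp() function, the derivation above can be verified by hand: rotating the (centered) features with the eigenvectors of their covariance matrix yields columns whose covariance matrix is diagonal. The sketch below assumes the training_sample and features_short objects used in the next chunk.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">X <- training_sample %>% dplyr::select(features_short) %>% as.matrix()
X <- scale(X, center = TRUE, scale = FALSE)   # Remove column means (zero-mean assumption)
Q <- eigen(cov(X))$vectors                    # P = Q: eigenvectors of the covariance matrix
Z <- X %*% Q                                  # Change of base: Z = XP
round(cov(Z), 5)                              # Off-diagonal terms are (numerically) zero</code></pre></div>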
<p>Below, we show how to perform PCA and visualize the output with the <em>factoextra</em> package. To ease readability, we use the same sample but with the smaller set of predictors.</p>
<div class="sourceCode" id="cb226"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb226-1"><a href="unsup.html#cb226-1"></a>pca <-<span class="st"> </span>training_sample <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb226-2"><a href="unsup.html#cb226-2"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="co"># Smaller number of predictors</span></span>
<span id="cb226-3"><a href="unsup.html#cb226-3"></a><span class="st"> </span><span class="kw">prcomp</span>() <span class="co"># Performs PCA</span></span>
<span id="cb226-4"><a href="unsup.html#cb226-4"></a>pca <span class="co"># Show the result</span></span></code></pre></div>
<pre><code>## Standard deviations (1, .., p=7):
## [1] 0.4536601 0.3344080 0.2994393 0.2452000 0.2352087 0.2010782 0.1140988
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5 PC6
## Div_Yld 0.27159946 -0.57909866 0.04572501 -0.52895604 -0.22662581 -0.506566090
## Eps 0.42040708 -0.15008243 -0.02476659 0.33737265 0.77137719 -0.301883295
## Mkt_Cap_12M_Usd 0.52386846 0.34323935 0.17228893 0.06249528 -0.25278113 -0.002987057
## Mom_11M_Usd 0.04723846 0.05771359 -0.89715955 0.24101481 -0.25055884 -0.258476580
## Ocf 0.53294744 0.19588990 0.18503939 0.23437100 -0.35759553 -0.049015486
## Pb 0.15241340 0.58080620 -0.22104807 -0.68213576 0.30866476 -0.038674594
## Vol1Y_Usd -0.40688963 0.38113933 0.28216181 0.15541056 -0.06157461 -0.762587677
## PC7
## Div_Yld 0.032011635
## Eps 0.011965041
## Mkt_Cap_12M_Usd 0.714319417
## Mom_11M_Usd 0.043178747
## Ocf -0.676866120
## Pb -0.168799297
## Vol1Y_Usd 0.008632062</code></pre>
<p>The rotation gives the matrix <span class="math inline">\(\textbf{P}\)</span>: it is the matrix that performs the change of base. The first line of the output indicates the standard deviation of each new factor (column). Each factor is labeled with a PC index (principal component). Often, the first PC (column PC1 in the output) loads positively on all initial features: a convex weighted average of all predictors is expected to carry a lot of information. In the above example, this is almost the case, with the exception of volatility, which has a negative coefficient in the first PC. The second PC is an arbitrage between price-to-book (long) and dividend yield (short). The third PC is contrarian, as it loads heavily and negatively on momentum. Not all principal components are easy to interpret.</p>
<p>Sometimes, it can be useful to visualize the way the principal components are built. In Figure <a href="unsup.html#fig:pca2">15.2</a>, we show one popular representation that is used for two factors (usually the first two). </p>
<div class="sourceCode" id="cb228"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb228-1"><a href="unsup.html#cb228-1"></a><span class="kw">library</span>(factoextra) <span class="co"># Package for PCA visualization</span></span>
<span id="cb228-2"><a href="unsup.html#cb228-2"></a><span class="kw">fviz_pca_var</span>(pca, <span class="co"># Source of PCA decomposition</span></span>
<span id="cb228-3"><a href="unsup.html#cb228-3"></a> <span class="dt">col.var=</span><span class="st">"contrib"</span>, </span>
<span id="cb228-4"><a href="unsup.html#cb228-4"></a> <span class="dt">gradient.cols =</span> <span class="kw">c</span>(<span class="st">"#00AFBB"</span>, <span class="st">"#E7B800"</span>, <span class="st">"#FC4E07"</span>),</span>
<span id="cb228-5"><a href="unsup.html#cb228-5"></a> <span class="dt">repel =</span> <span class="ot">TRUE</span> <span class="co"># Avoid text overlapping</span></span>
<span id="cb228-6"><a href="unsup.html#cb228-6"></a>)</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:pca2"></span>
<img src="ML_factor_files/figure-html/pca2-1.png" alt="Visual representation of PCA with two dimensions." width="330px" height="200px" />
<p class="caption">
FIGURE 15.2: Visual representation of PCA with two dimensions.
</p>
</div>
<p>The plot shows that no initial factor has negative signs on both of the first two principal components: volatility is negative for the first one, while earnings per share and dividend yield are negative for the second. The numbers indicated along the axes are the proportions of variance explained by each PC. They are obtained from the standard deviations in the first line of the output: each value is squared and then divided by the sum of all squared values.</p>
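<p>These proportions can be recovered directly from the prcomp() output: the standard deviations are squared and normalized so that they sum to one.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pca$sdev^2 / sum(pca$sdev^2)   # Share of total variance captured by each principal component</code></pre></div>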
<p>Once the rotation is known, it is possible to keep only a subset of the transformed data. From the original 7 features, we retain only the first 4 components below.</p>
<div class="sourceCode" id="cb229"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb229-1"><a href="unsup.html#cb229-1"></a>training_sample <span class="op">%>%</span><span class="st"> </span><span class="co"># Start from large sample</span></span>
<span id="cb229-2"><a href="unsup.html#cb229-2"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="co"># Keep only 7 features</span></span>
<span id="cb229-3"><a href="unsup.html#cb229-3"></a><span class="st"> </span><span class="kw">as.matrix</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Transform in matrix</span></span>
<span id="cb229-4"><a href="unsup.html#cb229-4"></a><span class="st"> </span><span class="kw">multiply_by_matrix</span>(pca<span class="op">$</span>rotation[,<span class="dv">1</span><span class="op">:</span><span class="dv">4</span>]) <span class="op">%>%</span><span class="st"> </span><span class="co"># Rotate via PCA (first 4 columns of P)</span></span>
<span id="cb229-5"><a href="unsup.html#cb229-5"></a><span class="st"> `</span><span class="dt">colnames<-</span><span class="st">`</span>(<span class="kw">c</span>(<span class="st">"PC1"</span>, <span class="st">"PC2"</span>, <span class="st">"PC3"</span>, <span class="st">"PC4"</span>)) <span class="op">%>%</span><span class="st"> </span><span class="co"># Change column names</span></span>
<span id="cb229-6"><a href="unsup.html#cb229-6"></a><span class="st"> </span><span class="kw">head</span>() <span class="co"># Show first 6 lines</span></span></code></pre></div>
<pre><code>## PC1 PC2 PC3 PC4
## [1,] 0.3989674 0.7578132 -0.13915223 0.3132578
## [2,] 0.4284697 0.7587274 -0.40164338 0.3745255
## [3,] 0.5215295 0.5679119 -0.10533870 0.2574949
## [4,] 0.5445359 0.5335619 -0.08833864 0.2281793
## [5,] 0.5672644 0.5339749 -0.06092424 0.2320938
## [6,] 0.5871306 0.6420126 -0.44566482 0.3075399</code></pre>
<p>These 4 factors can then be used as orthogonal features in any ML engine. The fact that the features are uncorrelated is undoubtedly an asset. But the price of this convenience is high: the features are no longer immediately interpretable. De-correlating the predictors adds yet another layer of “<em>blackbox-ing</em>” in the algorithm. </p>
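<p>As a purely illustrative follow-up, these components can be plugged directly into any predictive engine, for instance a plain regression of future returns on the first four factors. The sketch below assumes the objects defined above (and that the <em>magrittr</em> aliases are loaded).</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">Z <- training_sample %>%                           # Rotated features, as in the previous chunk
    dplyr::select(features_short) %>%
    as.matrix() %>%
    multiply_by_matrix(pca$rotation[, 1:4])        # First 4 columns of P
summary(lm(training_sample$R1M_Usd ~ Z))           # Simple regression on the 4 components</code></pre></div>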
<p>PCA can also be used to estimate factor models. In Equation <a href="unsup.html#eq:pca">(15.3)</a>, it suffices to replace <span class="math inline">\(\textbf{Z}\)</span> with returns, <span class="math inline">\(\textbf{X}\)</span> with factor values and <span class="math inline">\(\textbf{P}\)</span> with factor loadings (see, e.g., <span class="citation">Connor and Korajczyk (<a href="#ref-connor1988risk" role="doc-biblioref">1988</a>)</span> for an early reference). More recently, <span class="citation">Lettau and Pelger (<a href="#ref-lettau2018estimating" role="doc-biblioref">2020</a><a href="#ref-lettau2018estimating" role="doc-biblioref">a</a>)</span> and <span class="citation">Lettau and Pelger (<a href="#ref-lettau2018factors" role="doc-biblioref">2020</a><a href="#ref-lettau2018factors" role="doc-biblioref">b</a>)</span> propose a thorough analysis of PCA estimation techniques. They notably argue that first moments of returns are important and should be included in the objective function, alongside the optimization on the second moments.</p>
<p>We end this subsection with a technical note. Usually, PCA is performed on the covariance matrix of returns. Sometimes, it may be preferable to decompose the <strong>correlation</strong> matrix instead. The result may change substantially if the variables have very different variances (which is not really the case in the equity space). If the investment universe encompasses several asset classes, a correlation-based PCA will reduce the importance of the most volatile class. In this case, it is as if all returns were scaled by their respective volatilities.</p>
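<p>In practice, the correlation-based version simply amounts to scaling the columns to unit variance before the decomposition, which prcomp() handles through its scale. argument. The lines below are a minimal sketch with the same small set of predictors.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pca_corr <- training_sample %>% 
    dplyr::select(features_short) %>% 
    prcomp(scale. = TRUE)        # PCA on the correlation matrix (unit-variance columns)
pca_corr$sdev                    # Standard deviations of the rescaled components</code></pre></div>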
</div>
<div id="ae" class="section level3">
<h3><span class="header-section-number">15.2.3</span> Autoencoders</h3>
<p></p>
<p>In a PCA, the coding from <span class="math inline">\(\textbf{X}\)</span> to <span class="math inline">\(\textbf{Z}\)</span> is straightforward, linear and works both ways:
<span class="math display">\[\textbf{Z}=\textbf{X}\textbf{P} \quad \text{and} \quad \textbf{X}=\textbf{Z}\textbf{P}',\]</span>
so that we recover <span class="math inline">\(\textbf{X}\)</span> from <span class="math inline">\(\textbf{Z}\)</span>. This can be written differently:
<span class="math display" id="eq:pcascheme">\[\begin{equation}
\tag{15.5}
\textbf{X} \quad \overset{\text{encode via }\textbf{P}}{\longrightarrow} \quad \textbf{Z} \quad \overset{\text{decode via } \textbf{P}'}{\longrightarrow} \quad \textbf{X}
\end{equation}\]</span></p>
<p>If we take the truncated version and seek a smaller output (with only <span class="math inline">\(K'\)</span> columns), this gives:</p>
<p><span class="math display" id="eq:pcaschem2">\[\begin{equation}
\tag{15.6}
\textbf{X}, \ (I\times K) \quad \overset{\text{encode via }\textbf{P}_{K'}}{\longrightarrow} \quad \tilde{\textbf{X}}, \ (I \times K') \quad \overset{\text{decode via } \textbf{P}'_{K'}}{\longrightarrow} \quad \breve{\textbf{X}},\ (I \times K),
\end{equation}\]</span></p>
<p>where <span class="math inline">\(\textbf{P}_{K'}\)</span> is the restriction of <span class="math inline">\(\textbf{P}\)</span> to the <span class="math inline">\(K'\)</span> columns that correspond to the factors with the largest variances. The dimensions of the matrices are indicated in parentheses. In this case, the decoding cannot recover <span class="math inline">\(\textbf{X}\)</span> exactly but only an approximation, which we write <span class="math inline">\(\breve{\textbf{X}}\)</span>. This approximation is coded with less information, hence this new dataset <span class="math inline">\(\breve{\textbf{X}}\)</span> is compressed and provides a parsimonious representation of the original sample <span class="math inline">\(\textbf{X}\)</span>.</p>
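<p>The quality of this compression can be gauged by reconstructing the sample from the first <span class="math inline">\(K'\)</span> components and measuring the error. A minimal sketch with <span class="math inline">\(K'=4\)</span>, based on the pca object obtained previously, is shown below.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">X  <- training_sample %>% dplyr::select(features_short) %>% as.matrix()
Xc <- scale(X, center = TRUE, scale = FALSE)   # Centered features (prcomp centers by default)
P4 <- pca$rotation[, 1:4]                      # Restriction of P to its first 4 columns
X_breve <- Xc %*% P4 %*% t(P4)                 # Encode, then decode
mean((Xc - X_breve)^2)                         # Average reconstruction error</code></pre></div>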
<p>An autoencoder generalizes this concept to <strong>nonlinear</strong> coding functions. Simple linear autoencoders are closely linked to latent factor models (this equivalence has been formalized for single-layer autoencoders). The scheme is the following:
<span class="math display" id="eq:aescheme2">\[\begin{equation}
\tag{15.7}
\textbf{X},\ (I\times K) \quad \overset{\text{encode via } N} {\longrightarrow} \quad \tilde{\textbf{X}}=N(\textbf{X}), \ (I \times K') \quad \overset{\text{decode via } N'}{\longrightarrow} \quad \breve{\textbf{X}}=N'(\tilde{\textbf{X}}), \ (I \times K),
\end{equation}\]</span></p>
<p>where the encoding and decoding functions <span class="math inline">\(N\)</span> and <span class="math inline">\(N'\)</span> are often taken to be neural networks. The term <strong>autoencoder</strong> comes from the fact that the target output, which we often write <span class="math inline">\(\textbf{Z}\)</span>, is the original sample <span class="math inline">\(\textbf{X}\)</span>. Thus, the algorithm seeks to determine the function <span class="math inline">\(N\)</span> that minimizes the distance (to be defined) between <span class="math inline">\(\textbf{X}\)</span> and the output value <span class="math inline">\(\breve{\textbf{X}}\)</span>. The encoder generates an alternative representation of <span class="math inline">\(\textbf{X}\)</span>, whereas the decoder tries to recode it back to its original values. Naturally, the intermediate (coded) version <span class="math inline">\(\tilde{\textbf{X}}\)</span> is designed to have a smaller dimension than <span class="math inline">\(\textbf{X}\)</span>.</p>
</div>
<div id="application" class="section level3">
<h3><span class="header-section-number">15.2.4</span> Application</h3>
<p>
Autoencoders are easy to code in Keras (see Chapter <a href="NN.html#NN">7</a> for more details on Keras). To underline the power of the framework, we resort to another way of coding a NN: the so-called functional API. For simplicity, we work with the small number of predictors (7). The structure of the network consists of two symmetric parts (encoder and decoder), each with one intermediate layer containing 32 units. The activation function is sigmoid; this makes sense since the input has values in the unit interval.</p>
<div class="sourceCode" id="cb231"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb231-1"><a href="unsup.html#cb231-1"></a>input_layer <-<span class="st"> </span><span class="kw">layer_input</span>(<span class="dt">shape =</span> <span class="kw">c</span>(<span class="dv">7</span>)) <span class="co"># features_short has 7 columns </span></span>
<span id="cb231-2"><a href="unsup.html#cb231-2"></a></span>
<span id="cb231-3"><a href="unsup.html#cb231-3"></a>encoder <-<span class="st"> </span>input_layer <span class="op">%>%</span><span class="st"> </span><span class="co"># First, encode</span></span>
<span id="cb231-4"><a href="unsup.html#cb231-4"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">32</span>, <span class="dt">activation =</span> <span class="st">"sigmoid"</span>) <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb231-5"><a href="unsup.html#cb231-5"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">4</span>) <span class="co"># 4 dimensions for the output layer (same as PCA example)</span></span>
<span id="cb231-6"><a href="unsup.html#cb231-6"></a></span>
<span id="cb231-7"><a href="unsup.html#cb231-7"></a>decoder <-<span class="st"> </span>encoder <span class="op">%>%</span><span class="st"> </span><span class="co"># Then, from encoder, decode</span></span>
<span id="cb231-8"><a href="unsup.html#cb231-8"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">32</span>, <span class="dt">activation =</span> <span class="st">"sigmoid"</span>) <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb231-9"><a href="unsup.html#cb231-9"></a><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">7</span>) <span class="co"># the original sample has 7 features</span></span></code></pre></div>
<p>In the training part, we optimize the MSE and use an Adam update of the weights (see Section <a href="NN.html#backprop">7.2.3</a>).</p>
<div class="sourceCode" id="cb232"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb232-1"><a href="unsup.html#cb232-1"></a>ae_model <-<span class="st"> </span><span class="kw">keras_model</span>(<span class="dt">inputs =</span> input_layer, <span class="dt">outputs =</span> decoder) <span class="co"># Builds the model</span></span>
<span id="cb232-2"><a href="unsup.html#cb232-2"></a></span>
<span id="cb232-3"><a href="unsup.html#cb232-3"></a>ae_model <span class="op">%>%</span><span class="st"> </span><span class="kw">compile</span>( <span class="co"># Learning parameters</span></span>
<span id="cb232-4"><a href="unsup.html#cb232-4"></a> <span class="dt">loss =</span> <span class="st">'mean_squared_error'</span>,</span>
<span id="cb232-5"><a href="unsup.html#cb232-5"></a> <span class="dt">optimizer =</span> <span class="st">'adam'</span>,</span>
<span id="cb232-6"><a href="unsup.html#cb232-6"></a> <span class="dt">metrics =</span> <span class="kw">c</span>(<span class="st">'mean_absolute_error'</span>)</span>
<span id="cb232-7"><a href="unsup.html#cb232-7"></a>)</span></code></pre></div>
<p>Finally, we are ready to train the data onto itself! The evolution of the loss on the training and testing samples is depicted in Figure <a href="unsup.html#fig:aekeras3">15.3</a>. The decreasing pattern shows the improvement in the quality of the compression.</p>
<div class="sourceCode" id="cb233"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb233-1"><a href="unsup.html#cb233-1"></a>fit_ae <-<span class="st"> </span>ae_model <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb233-2"><a href="unsup.html#cb233-2"></a><span class="st"> </span><span class="kw">fit</span>(training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(), <span class="co"># Input</span></span>
<span id="cb233-3"><a href="unsup.html#cb233-3"></a> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(), <span class="co"># Output</span></span>
<span id="cb233-4"><a href="unsup.html#cb233-4"></a> <span class="dt">epochs =</span> <span class="dv">15</span>, <span class="dt">batch_size =</span> <span class="dv">512</span>,</span>
<span id="cb233-5"><a href="unsup.html#cb233-5"></a> <span class="dt">validation_data =</span> <span class="kw">list</span>(testing_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(), </span>
<span id="cb233-6"><a href="unsup.html#cb233-6"></a> testing_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>())</span>
<span id="cb233-7"><a href="unsup.html#cb233-7"></a> )</span>
<span id="cb233-8"><a href="unsup.html#cb233-8"></a><span class="kw">plot</span>(fit_ae) <span class="op">+</span><span class="st"> </span><span class="kw">theme_grey</span>()</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:aekeras3"></span>
<img src="ML_factor_files/figure-html/aekeras3-1.png" alt="Output from the training of an autoencoder." width="400px" />
<p class="caption">
FIGURE 15.3: Output from the training of an autoencoder.
</p>
</div>
<p>In order to get the details of all weights and biases, the syntax is the following.</p>
<div class="sourceCode" id="cb234"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb234-1"><a href="unsup.html#cb234-1"></a>ae_weights <-<span class="st"> </span>ae_model <span class="op">%>%</span><span class="st"> </span><span class="kw">get_weights</span>()</span></code></pre></div>
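<p>To retrieve the compressed representation itself (the analogue of the reduced set of PCA factors), one possible approach is to wrap the encoder part in its own model and call predict() on it. This is a sketch, not part of the original chunks; it assumes the input_layer and encoder objects defined above.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">encoder_model <- keras_model(inputs = input_layer, outputs = encoder)   # Encoder only
latent_factors <- encoder_model %>% 
    predict(training_sample %>% dplyr::select(features_short) %>% as.matrix())
head(latent_factors)                                                     # 4 compressed features per instance</code></pre></div>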