Reviewd and updated code

gedeck · Nov 8, 2024 · eae1988 · eae1988
1 parent 1cbf3be
commit eae1988
Show file tree

Hide file tree

Showing 20 changed files with 1,093 additions and 2,510 deletions.
diff --git a/.Rhistory b/.Rhistory
@@ -45,3 +45,244 @@ ggplot(kc_tax0, (aes(x=SqFtTotLiving, y=TaxAssessedValue))) +
 stat_binhex(color='white') +
 # theme_bw() +
 scale_fill_gradient(low='white', high='blue')
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+library(lubridate)
+library(ellipse)
+library(mclust)
+library(cluster)
+library(ca)
+PSDS_PATH <- file.path(dirname(dirname(getwd())))
+sp500_px <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_data.csv.gz'), row.names=1)
+sp500_sym <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_sectors.csv'), stringsAsFactors = FALSE)
+loan_data <- read.csv(file.path(PSDS_PATH, 'data', 'loan_data.csv.gz'))
+loan_data$outcome <- ordered(loan_data$outcome, levels=c('paid off', 'default'))
+housetasks <- read.csv(file.path(PSDS_PATH, 'data', 'housetasks.csv'), row.names=1)
+PSDS_PATH
+PSDS_PATH <- file.path(dirname(getwd()))
+PSDS_PATH
+PSDS_PATH <- file.path(getwd())
+sp500_px <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_data.csv.gz'), row.names=1)
+sp500_sym <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_sectors.csv'), stringsAsFactors = FALSE)
+loan_data <- read.csv(file.path(PSDS_PATH, 'data', 'loan_data.csv.gz'))
+loan_data$outcome <- ordered(loan_data$outcome, levels=c('paid off', 'default'))
+housetasks <- read.csv(file.path(PSDS_PATH, 'data', 'housetasks.csv'), row.names=1)
+oil_px <- sp500_px[, c('CVX', 'XOM')]
+pca <- princomp(oil_px)
+pca$loadings
+loadings <- pca$loadings
+graph <- ggplot(data=oil_px, aes(x=CVX, y=XOM)) +
+geom_point(alpha=.3) +
+scale_shape_manual(values=c(46)) +
+stat_ellipse(type='norm', level=.99, color='grey25') +
+geom_abline(intercept = 0, slope = loadings[2,1]/loadings[1,1], color='grey25', linetype=2) +
+geom_abline(intercept = 0, slope = loadings[2,2]/loadings[1,2],  color='grey25', linetype=2) +
+scale_x_continuous(expand=c(0,0)) +
+scale_y_continuous(expand=c(0,0)) +
+coord_cartesian(xlim=c(-3, 3), ylim=c(-3, 3)) +
+theme_bw()
+graph
+syms <- c( 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP',
+'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST')
+top_sp <- sp500_px[row.names(sp500_px)>='2011-01-01', syms]
+sp_pca <- princomp(top_sp)
+par(mar=c(6,3,0,0)+.1, las=2)
+screeplot(sp_pca, main='')
+loadings <- sp_pca$loadings[,1:5]
+loadings <- as.data.frame(loadings)
+loadings$Symbol <- row.names(loadings)
+loadings <- gather(loadings, 'Component', 'Weight', -Symbol)
+loadings$Color = loadings$Weight > 0
+graph <- ggplot(loadings, aes(x=Symbol, y=Weight, fill=Color)) +
+geom_bar(stat='identity', position='identity', width=.75) +
+facet_grid(Component ~ ., scales='free_y') +
+guides(fill='none') +
+ylab('Component Loading') +
+theme_bw() +
+theme(axis.title.x=element_blank(),
+axis.text.x=element_text(angle=90, vjust=0.5))
+graph
+ca_analysis <- ca(housetasks)
+plot(ca_analysis)
+if (!require(ggrepel)) {
+install.packages('ggrepel')
+library(ggrepel)
+}
+ca_analysis$sv ** 2
+summary(ca_analysis)
+contrib = ca_analysis$sv ** 2
+contrib = contrib / sum(contrib)
+colcoord = as.data.frame(ca_analysis$colcoord)
+rowcoord = as.data.frame(ca_analysis$rowcoord)
+coords = rbind(
+cbind(rowcoord, type='rowcoord'),
+cbind(colcoord, type='columns')
+)
+row.names(coords) <- gsub('_', ' ', row.names(coords))
+graph <- ggplot(coords, aes(x=Dim1, y=Dim2, color=type, label=rownames(coords), shape=type)) +
+geom_hline(yintercept=0, linetype='dotted', color='#444444') +
+geom_vline(xintercept = 0, linetype='dotted', color='#444444') +
+geom_point() +
+geom_text_repel() +
+xlab(sprintf('Dimension 1 (%.1f%%)', 100 * contrib[1])) +
+ylab(sprintf('Dimension 2 (%.1f%%)', 100 * contrib[2])) +
+scale_color_manual(values = c('blue', 'red')) +
+theme_bw() +
+theme(legend.position = "none")
+graph
+set.seed(1010103)
+df <- sp500_px[row.names(sp500_px)>='2011-01-01', c('XOM', 'CVX')]
+km <- kmeans(df, centers=4, nstart=1)
+df$cluster <- factor(km$cluster)
+head(df)
+centers <- data.frame(cluster=factor(1:4), km$centers)
+centers
+graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) +
+geom_point() +
+scale_shape_manual(values = c(1, 3, 2, 4),
+guide = guide_legend(override.aes=aes(size=1))) +
+geom_point(data=centers,  aes(x=XOM, y=CVX), size=2, stroke=2, color='black')  +
+theme_bw() +
+scale_x_continuous(expand=c(0,0)) +
+scale_y_continuous(expand=c(0,0)) +
+coord_cartesian(xlim=c(-2, 2), ylim=c(-2.5, 2.5))
+graph
+syms <- c( 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP',
+'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST')
+df <- sp500_px[row.names(sp500_px) >= '2011-01-01', syms]
+set.seed(10010)
+km <- kmeans(df, centers=5, nstart=10)
+km$size
+centers <- as.data.frame(t(km$centers))
+names(centers) <- paste('Cluster', 1:5)
+centers$Symbol <- row.names(centers)
+centers <- gather(centers, 'Cluster', 'Mean', -Symbol)
+centers$Color = centers$Mean > 0
+graph <- ggplot(centers, aes(x=Symbol, y=Mean, fill=Color)) +
+geom_bar(stat='identity', position='identity', width=.75) +
+facet_grid(Cluster ~ ., scales='free_y') +
+guides(fill='none')  +
+ylab('Component Loading') +
+theme_bw() +
+theme(axis.title.x=element_blank(),
+axis.text.x=element_text(angle=90, vjust=0.5))
+graph
+pct_var <- data.frame(pct_var = 0,
+num_clusters=2:14)
+totalss <- kmeans(df, centers=14, nstart=50, iter.max=100)$totss
+for (i in 2:14) {
+kmCluster <- kmeans(df, centers=i, nstart=50, iter.max = 100)
+pct_var[i-1, 'pct_var'] <- kmCluster$betweenss / totalss
+}
+graph <- ggplot(pct_var, aes(x=num_clusters, y=pct_var)) +
+geom_line() +
+geom_point() +
+labs(y='% Variance Explained', x='Number of Clusters') +
+scale_x_continuous(breaks=seq(2, 14, by=2))   +
+theme_bw()
+graph
+syms1 <- c('GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX',
+'XOM', 'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP',
+'WMT', 'TGT', 'HD', 'COST')
+df <- sp500_px[row.names(sp500_px) >= '2011-01-01', syms1]
+d <- dist(t(df))
+hcl <- hclust(d)
+plot(hcl, ylab='distance', xlab='', sub='', main='')
+cutree(hcl, k=4)
+cluster_fun <- function(df, method)
+{
+d <- dist(df)
+hcl <- hclust(d, method=method)
+tree <- cutree(hcl, k=4)
+df$cluster <- factor(tree)
+df$method <- method
+return(df)
+}
+df0 <- sp500_px[row.names(sp500_px) >= '2011-01-01', c('XOM', 'CVX')]
+df <- rbind(cluster_fun(df0, method='single'),
+cluster_fun(df0, method='average'),
+cluster_fun(df0, method='complete'),
+cluster_fun(df0, method='ward.D'))
+df$method <- ordered(df$method, c('single', 'average', 'complete', 'ward.D'))
+graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) +
+geom_point(alpha=0.6) +
+scale_shape_manual(values=c(1, 3, 4, 2),
+guide=guide_legend(override.aes=aes(size=2))) +
+facet_wrap( ~ method) +
+theme_bw()
+graph
+mu <- c(.5, -.5)
+sigma <- matrix(c(1, 1, 1, 2), nrow=2)
+prob <- c(.5, .75, .95, .99) ## or whatever you want
+names(prob) <- prob ## to get id column in result
+x <- NULL
+for (p in prob){
+x <- rbind(x,  ellipse(x=sigma, centre=mu, level=p))
+}
+df <- data.frame(x, prob=factor(rep(prob, rep(100, length(prob)))))
+names(df) <- c('X', 'Y', 'Prob')
+## Figure 7-9: Multivariate normal ellipses
+dfmu <- data.frame(X=mu[1], Y=mu[2])
+graph <- ggplot(df, aes(X, Y)) +
+geom_path(aes(linetype=Prob)) +
+geom_point(data=dfmu, aes(X, Y), size=3) +
+theme_bw()
+graph
+df <- sp500_px[row.names(sp500_px)>='2011-01-01', c('XOM', 'CVX')]
+mcl <- Mclust(df)
+summary(mcl)
+cluster <- factor(predict(mcl)$classification)
+graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) +
+geom_point(alpha=.8) +
+theme_bw() +
+scale_shape_manual(values = c(1, 3),
+guide = guide_legend(override.aes=aes(size=2)))
+graph
+summary(mcl, parameters=TRUE)$mean
+summary(mcl, parameters=TRUE)$variance[,,1]
+summary(mcl, parameters=TRUE)$variance[,,2]
+plot(mcl, what='BIC', ask=FALSE, cex=.75)
+defaults <- loan_data[loan_data$outcome=='default',]
+df <- defaults[, c('loan_amnt', 'annual_inc', 'revol_bal', 'open_acc', 'dti', 'revol_util')]
+km <- kmeans(df, centers=4, nstart=10)
+centers <- data.frame(size=km$size, km$centers)
+print(round(centers, digits=2))
+df0 <- scale(df)
+km0 <- kmeans(df0, centers=4, nstart=10)
+centers0 <- scale(km0$centers, center=FALSE, scale=1/attr(df0, 'scaled:scale'))
+centers0 <- scale(centers0, center=-attr(df0, 'scaled:center'), scale=FALSE)
+centers0 <- data.frame(size=km0$size, centers0)
+print(round(centers0, digits=2))
+km <- kmeans(df, centers=4, nstart=10)
+centers <- data.frame(size=km$size, km$centers)
+round(centers, digits=2)
+syms <- c('GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM',
+'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST')
+top_15 <- sp500_px[row.names(sp500_px)>='2011-01-01', syms]
+sp_pca1 <- princomp(top_15)
+screeplot(sp_pca1, main='')
+round(sp_pca1$loadings[,1:2], 3)
+x <- loan_data[1:5, c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>%
+mutate(home_=as.factor(home_), purpose_=as.factor(purpose_))
+x
+daisy(x, metric='gower')
+set.seed(301)
+df <- loan_data[sample(nrow(loan_data), 250),
+c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>%
+mutate(home_=as.factor(home_), purpose_=as.factor(purpose_))
+d = daisy(df, metric='gower')
+hcl <- hclust(d)
+dnd <- as.dendrogram(hcl)
+plot(dnd, leaflab='none', ylab='distance')
+dnd_cut <- cut(dnd, h=.5)
+df[labels(dnd_cut$lower[[4]]),]
+df <- model.matrix(~ -1 + dti + payment_inc_ratio + home_ + pub_rec_zero, data=defaults)
+df0 <- scale(df)
+km0 <- kmeans(df0, centers=4, nstart=10)
+centers0 <- scale(km0$centers, center=FALSE, scale=1/attr(df0, 'scaled:scale'))
+round(scale(centers0, center=-attr(df0, 'scaled:center'), scale=FALSE), 2)
+df
+x <- loan_data[1:5, c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>%
+mutate(home_=as.factor(home_), purpose_=as.factor(purpose_))
+x
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@ PSFDS_IMAGE=psfds
 JUPYTER_IMAGE=psfds_jupyter
 
 jupyter:
-	docker run --rm -v $(PWD):/src -p 8893:8893 $(JUPYTER_IMAGE) jupyter notebook --allow-root --port=8893 --ip 0.0.0.0 --no-browser
+	docker run --rm -v $(PWD):/src -p 8893:8893 $(JUPYTER_IMAGE) jupyter lab --allow-root --port=8893 --ip 0.0.0.0 --no-browser
 
 bash:
 	docker run -it --rm -v $(PWD):/src $(PSFDS_IMAGE) bash

diff --git a/R/code/Chapter 2 - Data and sampling distributions.R b/R/code/Chapter 2 - Data and sampling distributions.R
@@ -33,24 +33,24 @@ dev.off()
 ## Sampling Distribution of a Statistic
 
 # take a simple random sample
-samp_data <- data.frame(income=sample(loans_income, 1000), 
+samp_data <- data.frame(income=sample(loans_income, 1000),
                         type='data_dist')
 
 # take a sample of means of 5 values
 samp_mean_05 <- data.frame(
-  income = tapply(sample(loans_income, 1000*5), 
+  income = tapply(sample(loans_income, 1000*5),
                   rep(1:1000, rep(5, 1000)), FUN=mean),
   type = 'mean_of_5')
 
 # take a sample of means of 20 values
 samp_mean_20 <- data.frame(
-  income = tapply(sample(loans_income, 1000*20), 
+  income = tapply(sample(loans_income, 1000*20),
                   rep(1:1000, rep(20, 1000)), FUN=mean),
   type = 'mean_of_20')
 
 # bind the data.frames and convert type to a factor
 income <- rbind(samp_data, samp_mean_05, samp_mean_20)
-income$type <- factor(income$type, 
+income$type <- factor(income$type,
                      levels=c('data_dist', 'mean_of_5', 'mean_of_20'),
                      labels=c('Data', 'Mean of 5', 'Mean of 20'))
 
@@ -80,17 +80,16 @@ boot_ci <- boot.ci(boot_obj, conf=0.9, type='basic')
 X <- data.frame(mean=boot_obj$t)
 ci90 <- boot_ci$basic[4:5]
 ci <- data.frame(ci=ci90, y=c(9, 11))
-# ci <- boot_ci$basic[4:5]
 ci
 ggplot(X, aes(x=mean)) +
     geom_histogram(bins=40, fill='#AAAAAA') +
     geom_vline(xintercept=sampleMean, linetype=2) +
     geom_path(aes(x=ci, y=10), data=ci, size=2) +
     geom_path(aes(x=ci90[1], y=y), data=ci, size=2) +
     geom_path(aes(x=ci90[2], y=y), data=ci, size=2) +
-    geom_text(aes(x=sampleMean, y=20, label='Sample mean'), size=6) +
-    geom_text(aes(x=sampleMean, y=8, label='90% interval'), size=6) +
-    theme_bw() + 
+    annotate('text', x=sampleMean, y=20, label='Sample mean', size=6) +
+    annotate('text', x=sampleMean, y=8, label='90% interval', size=6) +
+    theme_bw() +
     labs(x='', y='Counts')
 
 ## Normal Distribution
@@ -127,3 +126,4 @@ rexp(n=100, rate=.2)
 ###  Weibull Distribution
 
 rweibull(100, 1.5, 5000)
+
diff --git a/R/code/Chapter 3 - Statistical Experiments and Significance Testing.R b/R/code/Chapter 3 - Statistical Experiments and Significance Testing.R
@@ -27,9 +27,9 @@ imanishi <-  read.csv(file.path(PSDS_PATH, 'data', 'imanishi_data.csv'))
 ## Resampling
 ### Example: Web Stickiness
 
-graph <- ggplot(session_times, aes(x=Page, y=Time)) + 
+graph <- ggplot(session_times, aes(x=Page, y=Time)) +
   geom_boxplot() +
-  labs(y='Time (in seconds)') + 
+  labs(y='Time (in seconds)') +
   theme_bw()
 graph
 
@@ -87,7 +87,7 @@ t.test(Time ~ Page, data=session_times, alternative='less')
 
 ## ANOVA
 
-graph <- ggplot(four_sessions, aes(x=Page, y=Time)) + 
+graph <- ggplot(four_sessions, aes(x=Page, y=Time)) +
   geom_boxplot() +
   labs(y='Time (in seconds)') +
   theme_bw()
@@ -149,3 +149,4 @@ pwr.2p.test(h=effect_size, sig.level=0.05, power=0.8, alternative='greater')
 
 effect_size = ES.h(p1=0.0165, p2=0.011)
 pwr.2p.test(h=effect_size, sig.level=0.05, power=0.8, alternative='greater')
+