Skip to content

Commit

Permalink
Reviewd and updated code
Browse files Browse the repository at this point in the history
  • Loading branch information
gedeck committed Nov 8, 2024
1 parent 1cbf3be commit eae1988
Show file tree
Hide file tree
Showing 20 changed files with 1,093 additions and 2,510 deletions.
241 changes: 241 additions & 0 deletions .Rhistory
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,244 @@ ggplot(kc_tax0, (aes(x=SqFtTotLiving, y=TaxAssessedValue))) +
stat_binhex(color='white') +
# theme_bw() +
scale_fill_gradient(low='white', high='blue')
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(ellipse)
library(mclust)
library(cluster)
library(ca)
PSDS_PATH <- file.path(dirname(dirname(getwd())))
sp500_px <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_data.csv.gz'), row.names=1)
sp500_sym <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_sectors.csv'), stringsAsFactors = FALSE)
loan_data <- read.csv(file.path(PSDS_PATH, 'data', 'loan_data.csv.gz'))
loan_data$outcome <- ordered(loan_data$outcome, levels=c('paid off', 'default'))
housetasks <- read.csv(file.path(PSDS_PATH, 'data', 'housetasks.csv'), row.names=1)
PSDS_PATH
PSDS_PATH <- file.path(dirname(getwd()))
PSDS_PATH
PSDS_PATH <- file.path(getwd())
sp500_px <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_data.csv.gz'), row.names=1)
sp500_sym <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_sectors.csv'), stringsAsFactors = FALSE)
loan_data <- read.csv(file.path(PSDS_PATH, 'data', 'loan_data.csv.gz'))
loan_data$outcome <- ordered(loan_data$outcome, levels=c('paid off', 'default'))
housetasks <- read.csv(file.path(PSDS_PATH, 'data', 'housetasks.csv'), row.names=1)
oil_px <- sp500_px[, c('CVX', 'XOM')]
pca <- princomp(oil_px)
pca$loadings
loadings <- pca$loadings
graph <- ggplot(data=oil_px, aes(x=CVX, y=XOM)) +
geom_point(alpha=.3) +
scale_shape_manual(values=c(46)) +
stat_ellipse(type='norm', level=.99, color='grey25') +
geom_abline(intercept = 0, slope = loadings[2,1]/loadings[1,1], color='grey25', linetype=2) +
geom_abline(intercept = 0, slope = loadings[2,2]/loadings[1,2], color='grey25', linetype=2) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(-3, 3), ylim=c(-3, 3)) +
theme_bw()
graph
syms <- c( 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP',
'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST')
top_sp <- sp500_px[row.names(sp500_px)>='2011-01-01', syms]
sp_pca <- princomp(top_sp)
par(mar=c(6,3,0,0)+.1, las=2)
screeplot(sp_pca, main='')
loadings <- sp_pca$loadings[,1:5]
loadings <- as.data.frame(loadings)
loadings$Symbol <- row.names(loadings)
loadings <- gather(loadings, 'Component', 'Weight', -Symbol)
loadings$Color = loadings$Weight > 0
graph <- ggplot(loadings, aes(x=Symbol, y=Weight, fill=Color)) +
geom_bar(stat='identity', position='identity', width=.75) +
facet_grid(Component ~ ., scales='free_y') +
guides(fill='none') +
ylab('Component Loading') +
theme_bw() +
theme(axis.title.x=element_blank(),
axis.text.x=element_text(angle=90, vjust=0.5))
graph
ca_analysis <- ca(housetasks)
plot(ca_analysis)
if (!require(ggrepel)) {
install.packages('ggrepel')
library(ggrepel)
}
ca_analysis$sv ** 2
summary(ca_analysis)
contrib = ca_analysis$sv ** 2
contrib = contrib / sum(contrib)
colcoord = as.data.frame(ca_analysis$colcoord)
rowcoord = as.data.frame(ca_analysis$rowcoord)
coords = rbind(
cbind(rowcoord, type='rowcoord'),
cbind(colcoord, type='columns')
)
row.names(coords) <- gsub('_', ' ', row.names(coords))
graph <- ggplot(coords, aes(x=Dim1, y=Dim2, color=type, label=rownames(coords), shape=type)) +
geom_hline(yintercept=0, linetype='dotted', color='#444444') +
geom_vline(xintercept = 0, linetype='dotted', color='#444444') +
geom_point() +
geom_text_repel() +
xlab(sprintf('Dimension 1 (%.1f%%)', 100 * contrib[1])) +
ylab(sprintf('Dimension 2 (%.1f%%)', 100 * contrib[2])) +
scale_color_manual(values = c('blue', 'red')) +
theme_bw() +
theme(legend.position = "none")
graph
set.seed(1010103)
df <- sp500_px[row.names(sp500_px)>='2011-01-01', c('XOM', 'CVX')]
km <- kmeans(df, centers=4, nstart=1)
df$cluster <- factor(km$cluster)
head(df)
centers <- data.frame(cluster=factor(1:4), km$centers)
centers
graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) +
geom_point() +
scale_shape_manual(values = c(1, 3, 2, 4),
guide = guide_legend(override.aes=aes(size=1))) +
geom_point(data=centers, aes(x=XOM, y=CVX), size=2, stroke=2, color='black') +
theme_bw() +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(-2, 2), ylim=c(-2.5, 2.5))
graph
syms <- c( 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP',
'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST')
df <- sp500_px[row.names(sp500_px) >= '2011-01-01', syms]
set.seed(10010)
km <- kmeans(df, centers=5, nstart=10)
km$size
centers <- as.data.frame(t(km$centers))
names(centers) <- paste('Cluster', 1:5)
centers$Symbol <- row.names(centers)
centers <- gather(centers, 'Cluster', 'Mean', -Symbol)
centers$Color = centers$Mean > 0
graph <- ggplot(centers, aes(x=Symbol, y=Mean, fill=Color)) +
geom_bar(stat='identity', position='identity', width=.75) +
facet_grid(Cluster ~ ., scales='free_y') +
guides(fill='none') +
ylab('Component Loading') +
theme_bw() +
theme(axis.title.x=element_blank(),
axis.text.x=element_text(angle=90, vjust=0.5))
graph
pct_var <- data.frame(pct_var = 0,
num_clusters=2:14)
totalss <- kmeans(df, centers=14, nstart=50, iter.max=100)$totss
for (i in 2:14) {
kmCluster <- kmeans(df, centers=i, nstart=50, iter.max = 100)
pct_var[i-1, 'pct_var'] <- kmCluster$betweenss / totalss
}
graph <- ggplot(pct_var, aes(x=num_clusters, y=pct_var)) +
geom_line() +
geom_point() +
labs(y='% Variance Explained', x='Number of Clusters') +
scale_x_continuous(breaks=seq(2, 14, by=2)) +
theme_bw()
graph
syms1 <- c('GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX',
'XOM', 'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP',
'WMT', 'TGT', 'HD', 'COST')
df <- sp500_px[row.names(sp500_px) >= '2011-01-01', syms1]
d <- dist(t(df))
hcl <- hclust(d)
plot(hcl, ylab='distance', xlab='', sub='', main='')
cutree(hcl, k=4)
cluster_fun <- function(df, method)
{
d <- dist(df)
hcl <- hclust(d, method=method)
tree <- cutree(hcl, k=4)
df$cluster <- factor(tree)
df$method <- method
return(df)
}
df0 <- sp500_px[row.names(sp500_px) >= '2011-01-01', c('XOM', 'CVX')]
df <- rbind(cluster_fun(df0, method='single'),
cluster_fun(df0, method='average'),
cluster_fun(df0, method='complete'),
cluster_fun(df0, method='ward.D'))
df$method <- ordered(df$method, c('single', 'average', 'complete', 'ward.D'))
graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) +
geom_point(alpha=0.6) +
scale_shape_manual(values=c(1, 3, 4, 2),
guide=guide_legend(override.aes=aes(size=2))) +
facet_wrap( ~ method) +
theme_bw()
graph
mu <- c(.5, -.5)
sigma <- matrix(c(1, 1, 1, 2), nrow=2)
prob <- c(.5, .75, .95, .99) ## or whatever you want
names(prob) <- prob ## to get id column in result
x <- NULL
for (p in prob){
x <- rbind(x, ellipse(x=sigma, centre=mu, level=p))
}
df <- data.frame(x, prob=factor(rep(prob, rep(100, length(prob)))))
names(df) <- c('X', 'Y', 'Prob')
## Figure 7-9: Multivariate normal ellipses
dfmu <- data.frame(X=mu[1], Y=mu[2])
graph <- ggplot(df, aes(X, Y)) +
geom_path(aes(linetype=Prob)) +
geom_point(data=dfmu, aes(X, Y), size=3) +
theme_bw()
graph
df <- sp500_px[row.names(sp500_px)>='2011-01-01', c('XOM', 'CVX')]
mcl <- Mclust(df)
summary(mcl)
cluster <- factor(predict(mcl)$classification)
graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) +
geom_point(alpha=.8) +
theme_bw() +
scale_shape_manual(values = c(1, 3),
guide = guide_legend(override.aes=aes(size=2)))
graph
summary(mcl, parameters=TRUE)$mean
summary(mcl, parameters=TRUE)$variance[,,1]
summary(mcl, parameters=TRUE)$variance[,,2]
plot(mcl, what='BIC', ask=FALSE, cex=.75)
defaults <- loan_data[loan_data$outcome=='default',]
df <- defaults[, c('loan_amnt', 'annual_inc', 'revol_bal', 'open_acc', 'dti', 'revol_util')]
km <- kmeans(df, centers=4, nstart=10)
centers <- data.frame(size=km$size, km$centers)
print(round(centers, digits=2))
df0 <- scale(df)
km0 <- kmeans(df0, centers=4, nstart=10)
centers0 <- scale(km0$centers, center=FALSE, scale=1/attr(df0, 'scaled:scale'))
centers0 <- scale(centers0, center=-attr(df0, 'scaled:center'), scale=FALSE)
centers0 <- data.frame(size=km0$size, centers0)
print(round(centers0, digits=2))
km <- kmeans(df, centers=4, nstart=10)
centers <- data.frame(size=km$size, km$centers)
round(centers, digits=2)
syms <- c('GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM',
'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST')
top_15 <- sp500_px[row.names(sp500_px)>='2011-01-01', syms]
sp_pca1 <- princomp(top_15)
screeplot(sp_pca1, main='')
round(sp_pca1$loadings[,1:2], 3)
x <- loan_data[1:5, c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>%
mutate(home_=as.factor(home_), purpose_=as.factor(purpose_))
x
daisy(x, metric='gower')
set.seed(301)
df <- loan_data[sample(nrow(loan_data), 250),
c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>%
mutate(home_=as.factor(home_), purpose_=as.factor(purpose_))
d = daisy(df, metric='gower')
hcl <- hclust(d)
dnd <- as.dendrogram(hcl)
plot(dnd, leaflab='none', ylab='distance')
dnd_cut <- cut(dnd, h=.5)
df[labels(dnd_cut$lower[[4]]),]
df <- model.matrix(~ -1 + dti + payment_inc_ratio + home_ + pub_rec_zero, data=defaults)
df0 <- scale(df)
km0 <- kmeans(df0, centers=4, nstart=10)
centers0 <- scale(km0$centers, center=FALSE, scale=1/attr(df0, 'scaled:scale'))
round(scale(centers0, center=-attr(df0, 'scaled:center'), scale=FALSE), 2)
df
x <- loan_data[1:5, c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>%
mutate(home_=as.factor(home_), purpose_=as.factor(purpose_))
x
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ PSFDS_IMAGE=psfds
JUPYTER_IMAGE=psfds_jupyter

jupyter:
docker run --rm -v $(PWD):/src -p 8893:8893 $(JUPYTER_IMAGE) jupyter notebook --allow-root --port=8893 --ip 0.0.0.0 --no-browser
docker run --rm -v $(PWD):/src -p 8893:8893 $(JUPYTER_IMAGE) jupyter lab --allow-root --port=8893 --ip 0.0.0.0 --no-browser

bash:
docker run -it --rm -v $(PWD):/src $(PSFDS_IMAGE) bash
Expand Down
16 changes: 8 additions & 8 deletions R/code/Chapter 2 - Data and sampling distributions.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,24 @@ dev.off()
## Sampling Distribution of a Statistic

# take a simple random sample
samp_data <- data.frame(income=sample(loans_income, 1000),
samp_data <- data.frame(income=sample(loans_income, 1000),
type='data_dist')

# take a sample of means of 5 values
samp_mean_05 <- data.frame(
income = tapply(sample(loans_income, 1000*5),
income = tapply(sample(loans_income, 1000*5),
rep(1:1000, rep(5, 1000)), FUN=mean),
type = 'mean_of_5')

# take a sample of means of 20 values
samp_mean_20 <- data.frame(
income = tapply(sample(loans_income, 1000*20),
income = tapply(sample(loans_income, 1000*20),
rep(1:1000, rep(20, 1000)), FUN=mean),
type = 'mean_of_20')

# bind the data.frames and convert type to a factor
income <- rbind(samp_data, samp_mean_05, samp_mean_20)
income$type <- factor(income$type,
income$type <- factor(income$type,
levels=c('data_dist', 'mean_of_5', 'mean_of_20'),
labels=c('Data', 'Mean of 5', 'Mean of 20'))

Expand Down Expand Up @@ -80,17 +80,16 @@ boot_ci <- boot.ci(boot_obj, conf=0.9, type='basic')
X <- data.frame(mean=boot_obj$t)
ci90 <- boot_ci$basic[4:5]
ci <- data.frame(ci=ci90, y=c(9, 11))
# ci <- boot_ci$basic[4:5]
ci
ggplot(X, aes(x=mean)) +
geom_histogram(bins=40, fill='#AAAAAA') +
geom_vline(xintercept=sampleMean, linetype=2) +
geom_path(aes(x=ci, y=10), data=ci, size=2) +
geom_path(aes(x=ci90[1], y=y), data=ci, size=2) +
geom_path(aes(x=ci90[2], y=y), data=ci, size=2) +
geom_text(aes(x=sampleMean, y=20, label='Sample mean'), size=6) +
geom_text(aes(x=sampleMean, y=8, label='90% interval'), size=6) +
theme_bw() +
annotate('text', x=sampleMean, y=20, label='Sample mean', size=6) +
annotate('text', x=sampleMean, y=8, label='90% interval', size=6) +
theme_bw() +
labs(x='', y='Counts')

## Normal Distribution
Expand Down Expand Up @@ -127,3 +126,4 @@ rexp(n=100, rate=.2)
### Weibull Distribution

rweibull(100, 1.5, 5000)

Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ imanishi <- read.csv(file.path(PSDS_PATH, 'data', 'imanishi_data.csv'))
## Resampling
### Example: Web Stickiness

graph <- ggplot(session_times, aes(x=Page, y=Time)) +
graph <- ggplot(session_times, aes(x=Page, y=Time)) +
geom_boxplot() +
labs(y='Time (in seconds)') +
labs(y='Time (in seconds)') +
theme_bw()
graph

Expand Down Expand Up @@ -87,7 +87,7 @@ t.test(Time ~ Page, data=session_times, alternative='less')

## ANOVA

graph <- ggplot(four_sessions, aes(x=Page, y=Time)) +
graph <- ggplot(four_sessions, aes(x=Page, y=Time)) +
geom_boxplot() +
labs(y='Time (in seconds)') +
theme_bw()
Expand Down Expand Up @@ -149,3 +149,4 @@ pwr.2p.test(h=effect_size, sig.level=0.05, power=0.8, alternative='greater')

effect_size = ES.h(p1=0.0165, p2=0.011)
pwr.2p.test(h=effect_size, sig.level=0.05, power=0.8, alternative='greater')

Loading

0 comments on commit eae1988

Please sign in to comment.