CPTR9_initial_qc_markdown.Rmd

---
title: "CPTR-9 Roberto Weigert"
output: html_document
date: "May 2, 2024"
---

```{r setup, include=FALSE}
# Default chunk option: echo the source code unless a chunk sets echo itself
knitr::opts_chunk$set(echo = TRUE)
```

```{r include=FALSE}

# Optional installation of the DSPWorkflow package ("dev" ref on GitHub).
# Nothing is installed while the switch below stays FALSE.

install.DSP <- FALSE
if (install.DSP) {
  library(devtools)
  install_github(repo = "NIDAP-Community/DSPWorkflow", ref = "dev")
}

library(DSPWorkflow)
library(GeomxTools)
library(dplyr)
library(knitr)
library(ggplot2)
library(ggforce)
library(gt)
library(stringr)
library(PCAtools)
library(hues)
library(scales)
library(Polychrome)
library(grid)
library(gridExtra)


```

``` {r include=FALSE}
# Load all inputs

# Root folder of the project; every input and output path is built from it
project.folder <- "/Users/cauleyes/CPTR/CPTR-9_Roberto_Weigert/"

# Collect every DCC file below <project.folder>dcc (searched recursively).
# The dot is escaped so only a literal ".dcc" extension matches.
dcc.files <- list.files(paste0(project.folder, "dcc"),
  pattern = "\\.dcc$",
  full.names = TRUE,
  recursive = TRUE
)

# PKC file name(s); the names without ".pkc" are used later as module names
pkc.files <- c("Mm_R_NGS_WTA_v1.0.pkc")

pkc.file.path <- paste0(project.folder, pkc.files)


annotation.file.path <- paste0(project.folder, "CPTR9_Weigert_annotation.xlsx")

# Samples identified as clustering away from other samples
high.PC.samples <- c(
  "DSP-1001660013739-B-B02.dcc",
  "DSP-1001660013739-B-B12.dcc",
  "DSP-1001660013739-B-C05.dcc",
  "DSP-1001660013739-B-C06.dcc",
  "DSP-1001660013739-B-D09.dcc",
  "DSP-1001660013739-B-D12.dcc",
  "DSP-1001660013739-B-E03.dcc",
  "DSP-1001660013739-B-E08.dcc",
  "DSP-1001660013739-B-E10.dcc",
  "DSP-1001660013739-B-F03.dcc",
  "DSP-1001660022226-A-D10.dcc",
  "DSP-1001660022226-A-E02.dcc",
  "DSP-1001660022226-A-E03.dcc",
  "DSP-1001660022226-A-E07.dcc",
  "DSP-1001660022226-A-F10.dcc",
  "DSP-1001660022226-A-F11.dcc",
  "DSP-1001660022226-A-H11.dcc"
)

# Create a new list with the high PC1 dccs removed.
# Files are matched on their exact file name rather than through a pasted
# regular expression: in a regex the dots match any character, and an empty
# exclusion list collapses to the pattern "" which matches (and would remove)
# every file.
filtered.dcc.files <- dcc.files[!(basename(dcc.files) %in% high.PC.samples)]


```

```{r include=FALSE}

# Run the study design step on the filtered DCC files, the PKC file and the
# "annotation" sheet of the annotation workbook. The returned list is kept
# because later chunks read `sdesign.list$object` from it.
sdesign.list <- studyDesign(
  dcc.files = filtered.dcc.files,
  pkc.files = pkc.file.path,
  pheno.data.file = annotation.file.path,
  pheno.data.sheet = "annotation",
  pheno.data.dcc.col.name = "Sample_ID",
  protocol.data.col.names = c("aoi", "roi"),
  experiment.data.col.names = c("panel"),
  slide.name.col = "slide name",
  class.col = "class",
  region.col = "region",
  segment.col = "segment",
  area.col = "area",
  nuclei.col = "nuclei",
  sankey.exclude.slide = FALSE,
  segment.id.length = 10
)

```

# Sankey Plot

```{r Sankey Plot, echo=FALSE, error=FALSE, warning=FALSE}


# GeoMx object returned by the study design step
object <- sdesign.list$object

# Define the lanes of the Sankey plot (left to right) and the fill lane.
# Everything below is driven by these names, so a lane can be changed here
# without editing the rest of the chunk.
lane1 <- "slide_name_short"
lane2 <- "region"
lane3 <- "segment"
fill_lane <- "region"
sankey.lanes <- c(lane1, lane2, lane3)

# Select the annotations we want to show, use `` to surround column
# names with spaces or special symbols

# Count the segments for every combination of the three lane annotations
count.mat <- count(pData(object),
                   !!as.name(lane1),
                   !!as.name(lane2),
                   !!as.name(lane3))

# Remove any rows with NA values and renumber the remaining rows
complete.rows <- complete.cases(count.mat)

if (!all(complete.rows)) {
  count.mat <- count.mat[complete.rows, ]
  rownames(count.mat) <- seq_len(nrow(count.mat))
}


# Gather the data and plot in order: lane 1, lane 2, ..., lane n
# gather_set_data creates x, id, y, and n fields within sankey.count.data
# Establish the levels of the Sankey
sankey.count.data <- gather_set_data(count.mat, 1:3)

# Define the annotations to use for the Sankey x axis labels:
# replace the lane index stored in x with the matching lane name
for (lane.index in seq_along(sankey.lanes)) {
  sankey.count.data$x[sankey.count.data$x == lane.index] <-
    sankey.lanes[lane.index]
}

# Fix the left-to-right order of the lanes
sankey.count.data$x <- factor(sankey.count.data$x, levels = sankey.lanes)

# For position of Sankey 100 segment scale
adjust.scale.pos <- 0

# plot Sankey diagram
sankey.plot <-
  ggplot(sankey.count.data,
         aes(
           x,
           id = id,
           split = y,
           value = n
         )) +
  geom_parallel_sets(aes(fill = !!as.name(fill_lane)), alpha = 0.5, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2) +
  geom_parallel_sets_labels(color = "gray",
                            size = 5,
                            angle = 0) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "bottom",
    axis.ticks.y = element_blank(),
    axis.line = element_blank(),
    axis.text.y = element_blank()
  ) +
  scale_y_continuous(expand = expansion(0)) +
  scale_x_discrete(expand = expansion(0)) +
  labs(x = "", y = "") +
  # Vertical scale bar spanning 100 segments (y = 20 to y = 120)
  annotate(
    geom = "segment",
    x = (3.25 - adjust.scale.pos),
    xend = (3.25 - adjust.scale.pos),
    y = 20,
    yend = 120,
    lwd = 2
  ) +
  # Label for the scale bar, rotated to run along it
  annotate(
    geom = "text",
    x = (3.19 - adjust.scale.pos),
    y = 70,
    angle = 90,
    size = 5,
    hjust = 0.5,
    label = "100 segments"
  )

print(sankey.plot)

```


# QC Preprocessing

```{r QC Preprocessing, echo=FALSE, error=FALSE, warning=FALSE}

# Run QC preprocessing on the study design object with the thresholds below.
# From the returned list this chunk uses `segments.qc`, `segment.flags` and
# `probe.flags`; a later chunk reads `object`.
qc.output <- qcProc(object = sdesign.list$object,
                    min.segment.reads = 1000,
                    percent.trimmed = 80,
                    percent.stitched = 80,
                    percent.aligned = 80,
                    percent.saturation = 50,
                    min.negative.count = 1,
                    max.ntc.count = 1000,
                    min.nuclei = 1,
                    min.area = 10,
                    print.plots = TRUE)
print(qc.output$segments.qc)

# Identify the flag columns (the logical columns of the segment flag table).
# vapply guarantees a logical vector even when the table has no columns.
flag.column.detect <- vapply(qc.output$segment.flags, is.logical, logical(1))
flag.column.names <- names(qc.output$segment.flags)[flag.column.detect]

# A function for coloring TRUE flags as red (everything else white)
red.flag <- function(x) {
  x <- as.logical(x)
  ifelse(x, "red", "white")
}

# Create a table for the segment flags
segment.flag.gt <- gt(qc.output$segment.flags) %>%
  data_color(columns = flag.column.names,
             colors = red.flag,
             alpha = 0.7)

# Create an HTML table for segment flags
#gtsave(segment.flag.gt, "segment_flag_table.html")

# Create an HTML table for probe flags
probe.flag.gt <- gt(qc.output$probe.flags)
#gtsave(probe.flag.gt, "probe_flag_table.html")


# Export the flags table

export.flags <- FALSE

if (export.flags) {

  # Make sure the output folder exists before writing into it
  qc.folder <- paste0(project.folder, "qc/")
  dir.create(qc.folder, showWarnings = FALSE, recursive = TRUE)

  write.csv(qc.output$segment.flags, file = paste0(qc.folder, "segment_qc_flags.csv"))

  write.csv(qc.output$probe.flags, file = paste0(qc.folder, "probe_qc_flags.csv"))

}
    
```

# 3. Filtering

### Segment Filtering by Gene Detection

```{r Filtering by Gene Detection, echo=FALSE, error=FALSE, warning=FALSE}


# Work on the object returned by the QC step
object <- qc.output$object

# Set up lists of segment IDs
segment.list.total <- pData(object)$segmentID

# Define Modules: the PKC file names without their ".pkc" extension
# (dot escaped and anchored so only a trailing literal ".pkc" is removed)
modules <- sub("\\.pkc$", "", pkc.files)

# Calculate limit of quantification (LOQ) in each segment
# LOQ = geomean(NegProbes) * geoSD(NegProbes)^(LOQ cutoff)
# LOQ is calculated for each module (pkc file)
loq <- data.frame(row.names = colnames(object))

# Floor applied to every LOQ value and exponent applied to the geometric SD
loq.min <- 2
loq.cutoff <- 2

for (module in modules) {
  neg.geo.mean <- paste0("NegGeoMean_", module)
  neg.geo.sd <- paste0("NegGeoSD_", module)

  # Without these columns the LOQ comparison below cannot be computed, so
  # fail here with a message that names the module
  if (!all(c(neg.geo.mean, neg.geo.sd) %in% colnames(pData(object)))) {
    stop("Columns ", neg.geo.mean, " and/or ", neg.geo.sd,
         " not found in the segment annotation", call. = FALSE)
  }

  loq[, module] <-
    pmax(loq.min,
         pData(object)[, neg.geo.mean] *
           pData(object)[, neg.geo.sd] ^ loq.cutoff)
}

# Store the loq df in the annotation df
pData(object)$loq <- loq

# Build the master loq matrix (features x segments): TRUE where the count of a
# feature is above the LOQ of its module in that segment
loq.mat <- do.call(rbind, lapply(modules, function(module) {
  # Gather rows with the given module
  ind <- fData(object)$Module == module

  # Check if each feature has counts above the LOQ
  t(esApply(object[ind, ], MARGIN = 1,
            FUN = function(x) {
              x > loq[, module]
            }))
}))

# ensure ordering since this is stored outside of the geomxSet
loq.mat <- loq.mat[fData(object)$TargetName, ]

# Evaluate and Filter Segment Gene Detection Rate
# Save detection rate information to pheno data
pData(object)$GenesDetected <- colSums(loq.mat, na.rm = TRUE)
pData(object)$GeneDetectionRate <- 100 * (pData(object)$GenesDetected / nrow(object))

# Establish detection bins
detection.bins <- c("less_than_1", "1_5", "5_10", "10_15", "greater_than_15")

# Determine detection thresholds: 1%, 5%, 10%, 15%, >15%
# include.lowest = TRUE keeps a detection rate of exactly 0 in the first bin
# instead of turning it into NA
pData(object)$DetectionThreshold <-
  cut(pData(object)$GeneDetectionRate,
      breaks = c(0, 1, 5, 10, 15, 100),
      labels = detection.bins,
      include.lowest = TRUE)

# stacked bar plot of different cut points (1%, 5%, 10%, 15%)
segment.stacked.bar.plot <- ggplot(pData(object),
                                   aes(x = DetectionThreshold)) +
  geom_bar(aes(fill = region)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  theme_bw() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Gene Detection Rate",
       y = "Segments, #",
       fill = "Segment Type")

print(segment.stacked.bar.plot)

# cut percent genes detected at 1, 5, 10, 15
segment.table <- kable(table(pData(object)$DetectionThreshold,
                             pData(object)$class))

# Make a list of segments with low detection (below 5%)
low.detection.segments <- pData(object) %>%
  filter(GeneDetectionRate < 5) %>%
  select(any_of(c("segmentID", "GeneDetectionRate")))
rownames(low.detection.segments) <- NULL

print(low.detection.segments)

# Export a summary of the segment gene detection
segment.detection.summary <- pData(object) %>%
  select(any_of(c("segmentID", "GeneDetectionRate", "DetectionThreshold")))

export.segment.detection.summary <- TRUE

if (export.segment.detection.summary) {

  # Make sure the output folder exists before writing into it
  qc.folder <- paste0(project.folder, "qc/")
  dir.create(qc.folder, showWarnings = FALSE, recursive = TRUE)

  write.csv(segment.detection.summary, paste0(qc.folder, "segment_detection_summary.csv"))

}

```

```{r include=FALSE}

# Minimum gene detection rate (in percent) a segment needs in order to be kept
segment.gene.rate.cutoff <- 0

# Keep only the segments (columns of the object) at or above the cutoff
passing.segments <- pData(object)$GeneDetectionRate >= segment.gene.rate.cutoff
object.segment.filtered <- object[, passing.segments]


```

### Gene Filtering by Detection per Segment

```{r Filtering by Detection per Segment, echo=FALSE, error=FALSE, warning=FALSE}
library(scales)

# Evaluate and Filter Study-wide Gene Detection Rate
# Restrict the LOQ matrix to the segments kept by the segment filter
loq.mat <- loq.mat[, colnames(object.segment.filtered)]

# Number of retained segments in which each gene is above the LOQ
fData(object.segment.filtered)$DetectedSegments <- rowSums(loq.mat, na.rm = TRUE)

# Detection rate in percent. The denominator is the number of segments in the
# filtered object, matching the columns kept in loq.mat above; dividing by the
# unfiltered segment count would understate the rate once segments are removed.
fData(object.segment.filtered)$DetectionRate <-
  100 * (fData(object.segment.filtered)$DetectedSegments /
           nrow(pData(object.segment.filtered)))

# Establish detection bins
detection.bins <- c("0", "less_than_1", "1_5", "5_10", "10_20", "20_30", "30_40", "40_50", "greater_than_50")

# Determine detection thresholds: 0%, then upper bounds of 1%, 5%, 10%, 20%,
# 30%, 40%, 50% and 100%. The -1 lower break gives a rate of exactly 0 its own bin.
fData(object.segment.filtered)$DetectionThreshold <-
  cut(fData(object.segment.filtered)$DetectionRate,
      breaks = c(-1, 0, 1, 5, 10, 20, 30, 40, 50, 100),
      labels = detection.bins)


# Stacked bar plot of the number of genes per detection bin
gene.stacked.bar.plot <- ggplot(fData(object.segment.filtered),
                                aes(x = DetectionThreshold)) +
  geom_bar(aes(fill = Module)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  theme_bw() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Gene Detection Rate",
       y = "Genes, #",
       fill = "Probe Set")

print(gene.stacked.bar.plot)

# Gene of interest detection table
goi <- c("A2m", "Cd44")

goi.table <- data.frame(Gene = goi,
                        Number = fData(object.segment.filtered)[goi, "DetectedSegments"],
                        DetectionRate = fData(object.segment.filtered)[goi, "DetectionRate"])
#print(goi.table)

# Plot detection rate:
# for each threshold (% of segments), count the genes detected at or above it
gene.detection.rate <- fData(object.segment.filtered)$DetectionRate

plot.detect <- data.frame(Freq = c(1, 5, 10, 20, 30, 50))
plot.detect$Number <- vapply(plot.detect$Freq,
                             function(x) sum(gene.detection.rate >= x),
                             integer(1))

# Share of all genes in the object that reach each threshold
plot.detect$Rate <- plot.detect$Number / nrow(fData(object.segment.filtered))
rownames(plot.detect) <- plot.detect$Freq

genes.detected.plot <- ggplot(plot.detect, aes(x = as.factor(Freq), y = Rate, fill = Rate)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = formatC(Number, format = "d", big.mark = ",")),
            vjust = 1.6, color = "black", size = 4) +
  scale_fill_gradient2(low = "orange2", mid = "lightblue",
                       high = "dodgerblue3", midpoint = 0.65,
                       limits = c(0,1),
                       labels = scales::percent) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent, limits = c(0,1),
                     expand = expansion(mult = c(0, 0))) +
  labs(x = "% of Segments",
       y = "Genes Detected, % of Panel > loq")

print(genes.detected.plot)


# Export a summary of the gene detection.
# The identifier column is TargetName: this is feature (gene) data, whereas
# segmentID is a column of the segment annotation.
gene.detection.summary <- fData(object.segment.filtered) %>%
  select(any_of(c("TargetName", "DetectionRate", "DetectionThreshold")))

export.gene.detection.summary <- FALSE

if (export.gene.detection.summary) {

  # Make sure the output folder exists before writing into it
  qc.folder <- paste0(project.folder, "qc/")
  dir.create(qc.folder, showWarnings = FALSE, recursive = TRUE)

  write.csv(gene.detection.summary, paste0(qc.folder, "gene_detection_summary.csv"))

  ggsave(paste0(qc.folder, "gene_detection_plot.pdf"),
         genes.detected.plot)
  ggsave(paste0(qc.folder, "gene_detection_plot_binned.pdf"),
         gene.stacked.bar.plot)

}

```

```{r include=FALSE}

# Minimum study-wide detection rate (in percent) a gene needs in order to be kept
study.gene.rate.cutoff <- 0.00

# Feature annotation of the segment-filtered object
feature.data <- fData(object.segment.filtered)

# Negative control probes are always retained, for downstream use
negative.probe.fData <- feature.data[which(feature.data$CodeClass == "Negative"), ]
neg.probes <- unique(negative.probe.fData$TargetName)

# Keep a gene when it reaches the cutoff or is a negative control probe
keep.gene <- feature.data$DetectionRate >= study.gene.rate.cutoff |
  feature.data$TargetName %in% neg.probes
object.gene.filtered <- object.segment.filtered[keep.gene, ]

```

# 4. Normalization
  
```{r Normalization, echo=FALSE, error=FALSE, warning=FALSE}
  
    # Normalize the gene-filtered object with norm = "q3" and show its plots
    q3.normalization.output <- geomxNorm(object = object.gene.filtered,
                                         norm = "q3")

    print(q3.normalization.output$multi.plot)
    print(q3.normalization.output$boxplot.raw)
    print(q3.normalization.output$boxplot.norm)

    # Same input normalized with norm = "neg"; only the boxplots are shown
    neg.normalization.output <- geomxNorm(object = object.gene.filtered,
                                          norm = "neg")

    print(neg.normalization.output$boxplot.raw)
    print(neg.normalization.output$boxplot.norm)
```

# PCA with PCAtools

### Setup

```{r PCA Setup, echo=FALSE, error=FALSE, warning=FALSE}

# See reference vignette: https://bioconductor.org/packages/release/bioc/vignettes/PCAtools/inst/doc/PCAtools.html#introduction

# Load the Geomx object
object <- q3.normalization.output$object

# Gather the normalized counts
norm.counts.df <- as.data.frame(object@assayData$q_norm)

# Convert counts to log2 (row names, i.e. the features, are kept)
log.counts.df <- log2(norm.counts.df)

# Remove the trailing ".dcc" from the sample (column) names
colnames(log.counts.df) <- sub("\\.dcc$", "", colnames(log.counts.df))

# Remove the negative controls from the log counts
control.probes <- c("NegProbe-WTX")
log.counts.df <- log.counts.df[!(rownames(log.counts.df) %in% control.probes), ]

# Load the annotation
annotation <- pData(object)

# Remove NTCs. %in% is used so a missing slide name does not produce NA rows.
is.ntc <- annotation$slide_name %in% "No Template Control"
cleaned.annotation.df <- as.data.frame(annotation[!is.ntc, ])

# Remove the trailing ".dcc" from the Sample ID row names so they use the same
# names as the count columns
rownames(cleaned.annotation.df) <- sub("\\.dcc$", "", rownames(cleaned.annotation.df))

# Sort the annotation by sample name
cleaned.annotation.df <- cleaned.annotation.df[order(rownames(cleaned.annotation.df)), ]

# Add categories for nuclei and area bins (10 bins each)
cleaned.annotation.df$nuclei.bin <- as.factor(ntile(cleaned.annotation.df$nuclei, 10))
cleaned.annotation.df$area.bin <- as.factor(ntile(cleaned.annotation.df$area, 10))

# Order of rownames of annotation need to match columns of count data.
# Selecting the count columns by annotation row name enforces the same order
# and also drops the counts of any sample removed from the annotation above.
missing.samples <- setdiff(rownames(cleaned.annotation.df), colnames(log.counts.df))
if (length(missing.samples) > 0) {
  stop("Samples in the annotation without normalized counts: ",
       paste(missing.samples, collapse = ", "), call. = FALSE)
}

log.counts.df <- log.counts.df[, rownames(cleaned.annotation.df), drop = FALSE]

```

### Run PCA

```{r PCA, echo=FALSE, error=FALSE, warning=FALSE}

# Generate a PCA table for all samples
pca.table <- pca(log.counts.df,
                 metadata = cleaned.annotation.df,
                 removeVar = 0.1)


# Build a PCAtools biplot colored by one annotation column, using the legend
# and label settings shared by every biplot in this report.
#   pca.result: object returned by pca()
#   colby:      name of the metadata column used for coloring
#   title:      plot title
#   colkey:     optional named vector of colors for the values of `colby`
make.pca.biplot <- function(pca.result, colby, title = "All samples", colkey = NULL) {
  biplot(pca.result,
         colby = colby,
         colkey = colkey,
         legendPosition = "right",
         legendLabSize = 10,
         legendIconSize = 5,
         lab = NULL,
         title = title,
         subtitle = "NTCs removed")
}

# Create biplots for all samples, colored by each annotation

# Segment
pca.plot.segment <- make.pca.biplot(pca.table, colby = "segment")

print(pca.plot.segment)

# Region
pca.plot.region <- make.pca.biplot(pca.table, colby = "region")

print(pca.plot.region)

# Cell Number Category
pca.plot.cell_num_cat <- make.pca.biplot(pca.table, colby = "cell_number_category")

print(pca.plot.cell_num_cat)

# Detection Threshold
pca.plot.gene_detection <- make.pca.biplot(pca.table, colby = "DetectionThreshold")

print(pca.plot.gene_detection)

# Generate colors for bins

# Define extreme colors for 1 and 10
color_10 <- "#FF0000"
#color_5 <- "#"
color_1 <- "#0000FF"

# Create color palette function.
# It is not called `palette` so that grDevices::palette() stays usable.
bin.palette <- colorRampPalette(c(color_1, color_10))

# Generate colors for values from 1 to 10
bin.colors <- bin.palette(10)

bin.colors.legend <- setNames(bin.colors, 1:10)

# Nuclei Count bin
pca.plot.nuclei <- make.pca.biplot(pca.table,
                                   colby = "nuclei.bin",
                                   title = "All samples by nuclei",
                                   colkey = bin.colors.legend)

print(pca.plot.nuclei)

# Area bin
pca.plot.area <- make.pca.biplot(pca.table,
                                 colby = "area.bin",
                                 title = "All samples by area",
                                 colkey = bin.colors.legend)

print(pca.plot.area)


export.biplots <- FALSE

if (export.biplots) {

  # Make sure the output folder exists before writing into it
  pca.folder <- paste0(project.folder, "pca/")
  dir.create(pca.folder, showWarnings = FALSE, recursive = TRUE)

  # Output file name (without extension) -> plot
  biplots.to.export <- list(
    pca_segment = pca.plot.segment,
    pca_region = pca.plot.region,
    pca_cell_number_category = pca.plot.cell_num_cat,
    pca_gene_detection = pca.plot.gene_detection,
    pca_nuclei_bin = pca.plot.nuclei,
    pca_area_bin = pca.plot.area
  )

  for (plot.name in names(biplots.to.export)) {
    ggsave(filename = paste0(pca.folder, plot.name, ".pdf"),
           plot = biplots.to.export[[plot.name]],
           device = "pdf")
  }

}

```

```{r, include=FALSE}

# Grab the PC data for high PC1
pc.data <- pca.table$rotated

# Samples with PC1 above 50. Base subsetting is used so the sample names
# (row names) are kept; which() drops any NA comparison.
high.pc1 <- pc.data[which(pc.data$PC1 > 50), , drop = FALSE]

high.pc1.samples <- rownames(high.pc1)

# Grab the annotation for the high PC1 samples.
# The lookup uses cleaned.annotation.df because its row names had ".dcc"
# removed, like the sample names given to pca(); `annotation` still carries
# the original pData row names, so these names would not match it.
high.pc1.anno <- cleaned.annotation.df[high.pc1.samples, ]

high.pc1.anno <- high.pc1.anno %>%
  select(segmentID,
         class,
         region,
         segment,
         cell_number_category,
         nuclei,
         area,
         DetectionThreshold)

export.high.pc.samples <- FALSE
if (export.high.pc.samples) {

  # Make sure the output folder exists before writing into it
  qc.folder <- paste0(project.folder, "qc/")
  dir.create(qc.folder, showWarnings = FALSE, recursive = TRUE)

  write.csv(high.pc1.anno, file = paste0(qc.folder, "high.pc1.samples.csv"))

}


```


## Nuclei count versus gene threshold

```{r}


# Combined region + segment label, used to color the first scatter plot
cleaned.annotation.df$region_segment <- paste0(cleaned.annotation.df$region, cleaned.annotation.df$segment)

# One color per region/segment combination
region.segments <- unique(cleaned.annotation.df$region_segment)
region.segment.colors <- unname(createPalette(length(region.segments),
                                              c("#ff0000", "#00ff00", "#0000ff"),
                                              M = 1000,
                                              range = c(10,70)))

# Legend styling shared by the plots below
small.legend.theme <- theme(legend.key.size = unit(0.3, "cm"),
                            legend.text = element_text(size = 6),
                            legend.title = element_text(size = 7))

# Axis scales shared by the plots below; the y axis (nuclei) is limited to 0-600.
# Returned as a fresh list on every call so each plot gets its own scales.
nuclei.axis.scales <- function() {
  list(
    scale_x_continuous(breaks = scales::pretty_breaks(n = 10)),
    scale_y_continuous(breaks = scales::pretty_breaks(n = 10), limits = c(0,600))
  )
}

# Gene detection rate vs nuclei: color by segment, shape by region
nuclei.genes.plot.combined.shape <- ggplot(cleaned.annotation.df, aes(x = GeneDetectionRate, y = nuclei, color = segment, shape = region)) +
  geom_point() +
  nuclei.axis.scales() +
  scale_color_manual(values = region.segment.colors) +
  small.legend.theme

# Gene detection rate vs nuclei: color by region/segment combination.
# The palette above is sized for these combinations, so it is applied here,
# named by combination so each one keeps a fixed color.
nuclei.genes.plot.combined <- ggplot(cleaned.annotation.df, aes(x = GeneDetectionRate, y = nuclei, color = region_segment)) +
  geom_point() +
  nuclei.axis.scales() +
  scale_color_manual(values = setNames(region.segment.colors, region.segments)) +
  small.legend.theme


# grid.arrange() draws the figure and returns it, so no extra grid.draw()
# call is needed (it would draw the same figure a second time)
combined.plots <- grid.arrange(nuclei.genes.plot.combined,
                               nuclei.genes.plot.combined.shape,
                               nrow = 1)

ggsave(paste0(project.folder, "genes_v_nuclei_combined_ylim.png"),
       combined.plots,
       height = 8,
       width = 14)

# Genes detected vs nuclei per region, each panel with its own axes
nuclei.genes.plot.free.scales <- ggplot(cleaned.annotation.df, aes(x = GenesDetected, y = nuclei, color = segment)) +
  geom_point() +
  facet_wrap(~region, scales = "free")

# Genes detected vs nuclei per region, shared axes
nuclei.genes.plot <- ggplot(cleaned.annotation.df, aes(x = GenesDetected, y = nuclei, color = segment)) +
  geom_point() +
  facet_wrap(~region) +
  nuclei.axis.scales() +
  small.legend.theme +
  theme(axis.text = element_text(size = 6),
        axis.text.x = element_text(angle = 45, hjust = 1))

# Gene detection rate vs nuclei per region, shared axes
nuclei.gene.rate.plot <- ggplot(cleaned.annotation.df, aes(x = GeneDetectionRate, y = nuclei, color = segment)) +
  geom_point() +
  facet_wrap(~region) +
  nuclei.axis.scales() +
  small.legend.theme +
  theme(axis.text = element_text(size = 6))

# Detection threshold bin vs nuclei per region, each panel with its own axes
nuclei.gene.threshold.plot <- ggplot(cleaned.annotation.df, aes(x = DetectionThreshold, y = nuclei, color = segment)) +
  geom_point() +
  facet_wrap(~region, scales = "free")

# Drawn by grid.arrange() itself, as above
facet.plots <- grid.arrange(nuclei.genes.plot,
                            nuclei.gene.rate.plot,
                            nrow = 1)

ggsave(paste0(project.folder, "genes_v_nuclei_faceted_ylim.png"),
       facet.plots,
       height = 8,
       width = 14)

```