# Load the numeric table and the table that also carries the sample
# phenotype columns (both were written with saveRDS-style serialization,
# hence readRDS despite the .RData extension).
tdata.FPKM <- readRDS(
  here("Data", "20190406_RNAseq_B6_4wk_2DG_counts_numeric.RData")
)
tdata.FPKM.sample.info <- readRDS(
  here("Data", "20190406_RNAseq_B6_4wk_2DG_counts_phenotypes.RData")
)

# log(x + 1) transform, held as a data frame.
log.tdata.FPKM <- as.data.frame(log(tdata.FPKM + 1))

# Full table: append columns 27238-27240 of the phenotype table, then drop
# sample A113. Row names are moved into a column for the filter and restored
# afterwards.
log.tdata.FPKM.sample.info <- cbind(
  log.tdata.FPKM,
  tdata.FPKM.sample.info[, 27238:27240]
) %>%
  rownames_to_column() %>%
  filter(rowname != "A113") %>%
  column_to_rownames()

# Keep only the columns that are non-zero in more than half of the rows.
log.tdata.FPKM.subset <- log.tdata.FPKM[
  ,
  colMeans(log.tdata.FPKM != 0) > 0.5
]

# Same construction as above, on the filtered columns.
log.tdata.FPKM.sample.info.subset <- cbind(
  log.tdata.FPKM.subset,
  tdata.FPKM.sample.info[, 27238:27240]
) %>%
  rownames_to_column() %>%
  filter(rowname != "A113") %>%
  column_to_rownames()

# Restrict to the three tissues of interest. The tissue strings are matched
# against the labels exactly as they are spelled in the Tissue column.
log.tdata.FPKM.sample.info.subset.hip.hyp.cortex <-
  log.tdata.FPKM.sample.info.subset %>%
  rownames_to_column() %>%
  filter(Tissue %in% c("Hippocampus", "Hypothanamus","Pre-frontal Cortex")) %>%
  column_to_rownames()
I will use Mouse IDs, Tissue, Treatment, and Time to keep track of the values in the matrices. All other covariates will be discarded.
# Relabel each row as "<mouse ID>:<Time>:<Treatment>:<Tissue>" so the sample
# annotations survive once the covariate columns are removed.
rownames(log.tdata.FPKM.sample.info.subset.hip.hyp.cortex) <- paste0(
  rownames(log.tdata.FPKM.sample.info.subset.hip.hyp.cortex), ":",
  log.tdata.FPKM.sample.info.subset.hip.hyp.cortex$Time, ":",
  log.tdata.FPKM.sample.info.subset.hip.hyp.cortex$Treatment, ":",
  log.tdata.FPKM.sample.info.subset.hip.hyp.cortex$Tissue
)
# Discard the three covariate columns (Time, Treatment, Tissue). They are
# dropped by name rather than by position (previously 17333:17335) so the
# step stays correct if the upstream column filter changes the column count.
log.tdata.FPKM.sample.info.subset.hip.hyp.cortex <-
  log.tdata.FPKM.sample.info.subset.hip.hyp.cortex[
    ,
    !(names(log.tdata.FPKM.sample.info.subset.hip.hyp.cortex) %in%
      c("Time", "Treatment", "Tissue")),
    drop = FALSE
  ]
# Preview the first five remaining columns.
head(log.tdata.FPKM.sample.info.subset.hip.hyp.cortex[, 1:5])
## ENSMUSG00000000001 ENSMUSG00000000028
## A007:4 wks:None:Hypothanamus 2.474856 0.7608058
## A008:4 wks:None:Hippocampus 2.889816 0.7129498
## A009:4 wks:None:Pre-frontal Cortex 2.636912 0.6259384
## A016:96 hrs:2DG:Hypothanamus 2.819592 0.4762342
## A017:96 hrs:2DG:Hippocampus 2.731115 1.1847900
## A018:96 hrs:2DG:Pre-frontal Cortex 3.065258 0.7129498
## ENSMUSG00000000031 ENSMUSG00000000037
## A007:4 wks:None:Hypothanamus 0.1222176 0.3435897
## A008:4 wks:None:Hippocampus 0.2776317 0.3148107
## A009:4 wks:None:Pre-frontal Cortex 0.1655144 0.2700271
## A016:96 hrs:2DG:Hypothanamus 0.6881346 0.9360934
## A017:96 hrs:2DG:Hippocampus 0.1133287 0.2468601
## A018:96 hrs:2DG:Pre-frontal Cortex 0.1310283 0.4121097
## ENSMUSG00000000049
## A007:4 wks:None:Hypothanamus 0.5128236
## A008:4 wks:None:Hippocampus 0.6471032
## A009:4 wks:None:Pre-frontal Cortex 0.6097656
## A016:96 hrs:2DG:Hypothanamus 0.2700271
## A017:96 hrs:2DG:Hippocampus 0.3364722
## A018:96 hrs:2DG:Pre-frontal Cortex 0.8372475
WGCNA will have poor results if the data have too many missing values. I checked whether any genes fall into this category.
# NOTE(review): `missing()` here is presumably a project helper that masks
# base::missing (the base function only works on function arguments) --
# confirm where it is defined and what it returns.
log.tdata.FPKM.sample.info.subset.hip.hyp.cortex.missing <- missing(
  log.tdata.FPKM.sample.info.subset.hip.hyp.cortex
)
# Print the `allOK` element of the goodSamplesGenes() result.
cat(
  "logFPKM: ",
  goodSamplesGenes(
    log.tdata.FPKM.sample.info.subset.hip.hyp.cortex.missing,
    verbose = 0
  )[["allOK"]],
  "\n"
)
## logFPKM: TRUE
WGCNA reports that all data are good! I now use hierarchical clustering to detect any obvious outliers. I did not see any particularly egregious outliers.
# Sample clustering for outlier inspection.
# NOTE(review): `sampleclustering()` is not defined in this section --
# presumably a project helper; confirm its source.
sampleclustering(
  log.tdata.FPKM.sample.info.subset.hip.hyp.cortex.missing
)
# Persist the checked table. The file is written with saveRDS(), so it must
# be read back with readRDS() even though the extension is .RData.
saveRDS(
  object = log.tdata.FPKM.sample.info.subset.hip.hyp.cortex.missing,
  file = here(
    "Data", "Brain",
    "log.tdata.FPKM.sample.info.subset.hip.hyp.cortex.missing.WGCNA.RData"
  )
)
Analysis performed by Ann Wells
The Carter Lab The Jackson Laboratory 2023
ann.wells@jax.org