# Load the phenotype table and the numeric expression table.
# NOTE(review): both files are read with readRDS() although they carry an
# .RData extension.
tdata.FPKM.sample.info <- readRDS(
  here("Data", "20190406_RNAseq_B6_4wk_2DG_counts_phenotypes.RData")
)
tdata.FPKM <- readRDS(
  here("Data", "20190406_RNAseq_B6_4wk_2DG_counts_numeric.RData")
)

# log(x + 1) transform of the expression values, stored as a data frame.
log.tdata.FPKM <- as.data.frame(log(tdata.FPKM + 1))

# Append columns 27238-27240 of the phenotype table (used below as the
# sample covariates) and drop sample "A113".
# NOTE(review): the column positions are hard-coded; confirm they still point
# at the intended covariates if the phenotype table changes.
log.tdata.FPKM.sample.info <-
  cbind(log.tdata.FPKM, tdata.FPKM.sample.info[, 27238:27240]) %>%
  rownames_to_column() %>%
  filter(rowname != "A113") %>%
  column_to_rownames()

# Keep only the columns whose value is non-zero in more than half of the rows.
log.tdata.FPKM.subset <-
  log.tdata.FPKM[, colMeans(log.tdata.FPKM != 0) > 0.5]

# Same covariate binding and "A113" removal, applied to the filtered columns.
log.tdata.FPKM.sample.info.subset <-
  cbind(log.tdata.FPKM.subset, tdata.FPKM.sample.info[, 27238:27240]) %>%
  rownames_to_column() %>%
  filter(rowname != "A113") %>%
  column_to_rownames()

# Restrict to the rows whose Tissue is "Kidney".
log.tdata.FPKM.sample.info.subset.kidney <-
  log.tdata.FPKM.sample.info.subset %>%
  rownames_to_column() %>%
  filter(Tissue == "Kidney") %>%
  column_to_rownames()
I will use Mouse IDs, Treatment, and Time to keep track of the values in the matrices. All other covariates will be discarded.
# Set rownames to "<mouse ID>:<Time>:<Treatment>" so each sample remains
# identifiable after the covariate columns are removed.
rownames(log.tdata.FPKM.sample.info.subset.kidney) <- paste0(
  rownames(log.tdata.FPKM.sample.info.subset.kidney), ":",
  log.tdata.FPKM.sample.info.subset.kidney[["Time"]], ":",
  log.tdata.FPKM.sample.info.subset.kidney[["Treatment"]]
)
# Discard the covariate columns by name instead of by position. The previous
# positional range (17333:17336) spanned four columns although only three
# sample-info columns were appended, so it depended on the exact number of
# genes that passed filtering and could drop a gene column.
# NOTE(review): assumes the three appended sample-info columns are Tissue,
# Time and Treatment (the covariate names referenced in this script) - confirm.
log.tdata.FPKM.sample.info.subset.kidney <-
  log.tdata.FPKM.sample.info.subset.kidney[
    ,
    !(colnames(log.tdata.FPKM.sample.info.subset.kidney) %in%
        c("Tissue", "Time", "Treatment")),
    drop = FALSE
  ]
head(log.tdata.FPKM.sample.info.subset.kidney[, 1:5])
## ENSMUSG00000000001 ENSMUSG00000000028 ENSMUSG00000000031
## A002:4 wks:None 3.378611 0.6981347 0.12221763
## A011:96 hrs:2DG 3.544143 0.4762342 0.11332869
## A020:4 wks:2DG 3.624874 0.7793249 0.30748470
## A029:4 wks:2DG 3.531933 0.5364934 0.22314355
## A038:4 wks:None 3.601686 0.8197798 0.00000000
## A047:4 wks:None 3.419692 0.6626880 0.07696104
## ENSMUSG00000000037 ENSMUSG00000000049
## A002:4 wks:None 0.29266961 2.841998
## A011:96 hrs:2DG 0.06765865 3.109953
## A020:4 wks:2DG 0.04879016 2.794839
## A029:4 wks:2DG 0.11332869 2.520917
## A038:4 wks:None 0.00000000 2.778198
## A047:4 wks:None 0.08617770 2.867899
WGCNA will have poor results if the data have too many missing values. I checked whether any genes fall into this category.
# NOTE(review): `missing` is presumably a project helper defined elsewhere
# that masks base::missing() - confirm; its definition is not visible here.
log.tdata.FPKM.sample.info.subset.kidney.missing <-
  missing(log.tdata.FPKM.sample.info.subset.kidney)

# Print the overall pass/fail flag from the goodSamplesGenes() check.
cat(
  "logFPKM: ",
  goodSamplesGenes(
    log.tdata.FPKM.sample.info.subset.kidney.missing,
    verbose = 0
  )[["allOK"]],
  "\n"
)
## logFPKM: TRUE
WGCNA reports that all data are good! I now use hierarchical clustering to detect any obvious outliers. I did not see any particularly egregious outliers.
# Cluster the samples to look for outliers.
# NOTE(review): `sampleclustering` is not defined in this section; presumably
# a project helper - confirm.
sampleclustering(log.tdata.FPKM.sample.info.subset.kidney.missing)

# Persist the checked kidney matrix for the downstream WGCNA steps.
# The file is written in RDS format despite the .RData extension.
saveRDS(
  log.tdata.FPKM.sample.info.subset.kidney.missing,
  file = here(
    "Data",
    "Kidney",
    "log.tdata.FPKM.sample.info.subset.kidney.missing.WGCNA.RData"
  )
)
Analysis performed by Ann Wells
The Carter Lab The Jackson Laboratory 2023
ann.wells@jax.org