# Recovered from a diff view of 02_code/R/230620_JB_AssignmentSolution.R.
# The page listed the same script twice; it is kept once here.
#
# Purpose: load the long-format biofilm data (columns used below: file,
# indicator, mz, int), bin mz to one decimal, reshape to one row per file,
# run a PCA on the binned intensities, and create a stratified 80/20
# train/test split on `indicator`.

# Clear workspace and graphs
# NOTE(review): rm(list = ls()) is kept so the script behaves as before, but
# wiping the workspace from inside a script is discouraged; prefer running
# it in a fresh R session.
if (!is.null(dev.list())) dev.off()
rm(list = ls())

library("caret")
library("data.table")

# Fix the RNG seed so the partition below is reproducible.
set.seed(100)

# The .Rdata file is expected to provide the object `biofilm_data`.
load(file = "01_data/biofilm.Rdata")
setDT(biofilm_data)

# Descriptive: there are several files in the dataset
file_list <- unique(biofilm_data$file)

# Let's get one out of it (the second file in order of appearance)
onefile <- biofilm_data[file == file_list[2], ]

# Let's graph the mz values against the counts
plot(onefile$mz, onefile$int, pch = 16, type = "h")

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Splitting based on outcome variables ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 1: Get row numbers for the training data ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Bin mz to one decimal place (in place, no copy of the table).
biofilm_data[, mz := round(mz, digits = 1)]
setkeyv(biofilm_data, cols = c("file", "indicator", "mz"))

# This step takes a long time.
# Sum the intensities that fall into the same (file, indicator, mz) bin.
# The result must be stored: previously it was computed and discarded, so
# dcast() below saw duplicate keys and fell back to counting rows.
biofilm_binned <- biofilm_data[
  , .(int = sum(int)),
  by = c("file", "indicator", "mz")
]

# One row per file, one column per mz bin. Bins that a file does not
# contain are filled with 0 instead of NA, because prcomp() rejects
# missing values.
biofilm_data_wide <- dcast(
  biofilm_binned,
  file + indicator ~ mz,
  value.var = "int",
  fill = 0
)
biofilm_data_wide[, file := NULL]

# PCA on the mz-bin columns only; the outcome `indicator` is not a feature.
feature_cols <- setdiff(names(biofilm_data_wide), "indicator")
pcas <- prcomp(biofilm_data_wide[, ..feature_cols])

# list = FALSE: a one-column matrix of row indices is returned, not a list
# times = 1:    a single partition (more are possible)
# For a factor outcome the sampling is done within each class, so the
# outcome variable is represented in the same proportions in the split.
trainRowNumbers <- createDataPartition(
  as.factor(biofilm_data_wide$indicator),
  p = 0.8,
  list = FALSE,
  times = 1
)

# Plain integer vector for row indexing (the matrix itself is left as is).
train_idx <- trainRowNumbers[, 1]

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 2: Create the training dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
trainData <- biofilm_data_wide[train_idx, ]

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 3: Create the test dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
testData <- biofilm_data_wide[-train_idx, ]

# What are the frequencies?
# Both tables are per file (wide data), so the proportions are comparable;
# the long table would weight each class by its number of peaks.
prop.table(table(biofilm_data_wide$indicator))
prop.table(table(trainData$indicator))

# The outcome variable Y for later use.
y <- trainData$indicator