Commit be4f31a3 authored by Johannes Bleher's avatar Johannes Bleher
Browse files

Some changes

parent 6b146b8b
Loading
Loading
Loading
Loading
+0 −65
Original line number Diff line number Diff line
# Reset the session: close all graphics devices and empty the workspace.
# graphics.off() does nothing when no device is open and, unlike a single
# dev.off(), also closes any additional devices that are still open.
graphics.off()
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; attached packages and options are left as they are. Running
# the script in a fresh R session is the more reliable way to start clean.
rm(list = ls())


library("caret")
library("data.table")

# Preparation for building the training and test datasets:
# fix the random seed so that later random steps give the same result
# on every run.
set.seed(100)

# Read the stored workspace file (expected to provide `biofilm_data`)
# and turn that object into a data.table by reference.
load(file = "01_data/biofilm.Rdata")
setDT(biofilm_data)

# Descriptive: the dataset contains several files; collect their names.
file_list <- unique(biofilm_data[["file"]])

# Keep only the rows that belong to the second file in the list.
onefile <- biofilm_data[biofilm_data[["file"]] == file_list[2], ]

# Draw the counts of that file against its mz values as vertical lines.
plot(onefile$mz, onefile$int, type = "h", pch = 16)

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Splitting based on outcome variables ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 1: Get row numbers for the training data ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Bin the mz axis: round to one decimal so that nearby values share a bin.
# `:=` updates the column by reference instead of copying the whole table.
biofilm_data[, mz := round(mz, digits = 1)]
setkeyv(biofilm_data, cols = c("file", "indicator", "mz"))

# Sum the intensities that fall into the same (file, indicator, mz) bin.
# This step takes a long time.
# The result must be stored: it used to be computed and then discarded, so
# dcast() below received duplicated bins and fell back to counting rows
# (its default aggregate is `length`) instead of summing intensities.
biofilm_binned <- biofilm_data[, .(int = sum(int)),
                               by = c("file", "indicator", "mz")]

# Wide layout: one row per file/indicator pair, one column per mz bin.
# Bins that do not occur for a row are filled with 0 instead of NA, because
# prcomp() stops on missing values.
# NOTE(review): 0 assumes "no peak recorded" means zero intensity - confirm.
biofilm_data_wide <- dcast(biofilm_binned, file + indicator ~ mz,
                           value.var = "int", fill = 0)
biofilm_data_wide <- biofilm_data_wide[, -c("file")]

# Principal components of the mz columns only. The outcome column
# `indicator` is excluded so that it does not enter the components.
pcas <- prcomp(biofilm_data_wide[, -c("indicator")])


# Draw the row numbers of the training set (80% of the rows).
# list = FALSE: the indices are returned as a matrix, not as a list
# times = 1: a single partition is drawn; larger values give several
# If the outcome is a factor, the split is done so that the outcome
# classes are equally represented in the partition.
trainRowNumbers <- createDataPartition(
  as.factor(biofilm_data_wide$indicator),
  p = 0.8, list = FALSE, times = 1
)
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 2: Create the training  dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# as.vector() drops the matrix dimensions so that `i` is a plain index vector.
trainData <- biofilm_data_wide[as.vector(trainRowNumbers), ]

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 3: Create the test dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Every row that was not selected for training.
testData <- biofilm_data_wide[-as.vector(trainRowNumbers), ]


# What are the frequencies?
# Both shares are computed on the wide table so they are comparable: the
# long table has one row per mz value, which would weight each class by its
# number of rows rather than by its number of samples.
prop.table(table(biofilm_data_wide$indicator))
prop.table(table(trainData$indicator))

# The outcome variable Y for later use.
y <- trainData$indicator