Commit 218cb8ec authored by Johannes Bleher's avatar Johannes Bleher
Browse files

Data, literature and assignment added

parents
Loading
Loading
Loading
Loading
+582 KiB

File added.

No diff preview for this file type.

+0 −0

File added.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

+0 −0

File added.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

01_data/biofilm.Rdata

0 → 100644
+20.1 MiB

File added.

No diff preview for this file type.

+57 −0
Original line number Diff line number Diff line
# Reset the session: close all graphics devices and empty the workspace.
# graphics.off() shuts down every open device (dev.off() would only close the
# current one) and does nothing when no device is open, so no guard is needed.
graphics.off()
# NOTE: this removes the objects in the global environment only; attached
# packages and options() are left as they are.
rm(list = ls())


library("caret")
library("data.table")

# Fix the random seed so that the random partition drawn further down in this
# script is reproducible.
set.seed(100)

# Restore the saved objects; the file is expected to provide `biofilm_data`.
load(file = "01_data/biofilm.Rdata")
# Convert to a data.table by reference (no copy is made).
setDT(biofilm_data)

# Descriptive: the data set holds measurements from several files
file_list <- unique(biofilm_data[["file"]])

# Take the rows of one file (the second one in `file_list`) as an example
onefile <- biofilm_data[biofilm_data[["file"]] == file_list[2], ]

# Plot the `int` values against `mz` as vertical lines (type = "h")
plot(
  x = onefile$mz,
  y = onefile$int,
  type = "h",
  pch = 16
)

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Splitting based on outcome variables ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 1: Get row numbers for the training data ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Reshape from long to wide: one row per (file, indicator) combination and one
# column per distinct `mz` value, filled with the corresponding `int` value.
# NOTE(review): combinations that do not occur in the long data become NA; if a
# missing peak should count as zero intensity, pass `fill = 0` -- confirm.
biofilm_data_wide <- dcast(
  biofilm_data,
  file + indicator ~ mz,
  value.var = "int"
)

# The file name is only an identifier, so it is dropped from the wide table.
biofilm_data_wide <- biofilm_data_wide[, -c("file")]

# Principal components of the mz columns only. The outcome column `indicator`
# is excluded: it is the label, not a feature, and a non-numeric column would
# make prcomp() fail.
# NOTE(review): prcomp() is expected to stop on NA cells -- see the note above.
pcas <- prcomp(biofilm_data_wide[, -c("indicator")])


# Draw one random partition that puts 80 % of the rows into the training set.
# NOTE(review): `biofilm_data` is in long format (the dcast() call earlier in
# this script treats file + indicator as the row id and mz as columns), so this
# splits single measurements, not whole files; rows of one file can end up in
# both sets. Confirm that this is intended.
trainRowNumbers <- createDataPartition(
  y = biofilm_data$indicator,
  p = 0.8,
  list = FALSE,
  times = 1
)
# list = FALSE: the row numbers are returned as a one-column matrix, not a list
# times: number of partitions to draw (one here)
# if the outcome is a factor, the split is done so that the outcome variable is
# equally represented
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 2: Create the training  dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# data.table does not accept a matrix as row index, so the single column of
# `trainRowNumbers` is extracted as a plain integer vector first.
trainData <- biofilm_data[trainRowNumbers[, 1], ]

# What are the frequencies? (relative class frequencies, full vs. training data)
table(biofilm_data$indicator) / length(biofilm_data$indicator)
table(trainData$indicator) / length(trainData$indicator)

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 3: Create the test dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Every row that was not selected for training goes into the test set.
testData <- biofilm_data[-trainRowNumbers[, 1], ]

# The outcome variable Y for later use.
y <- trainData$indicator