Commit 6b146b8b authored by Johannes Bleher's avatar Johannes Bleher
Browse files

New assignment

parent 218cb8ec
Loading
Loading
Loading
Loading

02_code/R/.gitignore

0 → 100644
+1 −0
Original line number Diff line number Diff line
230620_JB_AssignmentSolution.R
+15 −7
Original line number Diff line number Diff line
@@ -25,28 +25,36 @@ plot(onefile$mz,onefile$int,pch=16,type="h")
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 1: Get row numbers for the training data ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Bin the m/z axis to one decimal place so that peaks from different
# files land on shared mz values.
biofilm_data$mz <- round(biofilm_data$mz, digits = 1)
setkeyv(biofilm_data, cols = c("file", "indicator", "mz"))

# Sum the intensities that share a (file, indicator, mz) bin after rounding.
# This step takes a long time.
# The result must be stored: if it is only printed, dcast() below still sees
# duplicate bins and falls back to fun.aggregate = length, which turns the
# intensities into counts.
biofilm_data_binned <- biofilm_data[, .(int = sum(int)),
                                    by = c("file", "indicator", "mz")]

# One row per file, one column per mz bin.
biofilm_data_wide <- dcast(biofilm_data_binned, file + indicator ~ mz,
                           value.var = "int")
biofilm_data_wide <- biofilm_data_wide[, -c("file")]

# PCA on the spectra only: the outcome column must not be part of the
# feature matrix handed to prcomp().
# NOTE(review): mz bins that are absent for a file are NA after dcast();
# prcomp() does not accept NA, so impute or fill them first if this fails.
pcas <- prcomp(biofilm_data_wide[, -c("indicator")])

# Stratified 80/20 split on the outcome, one row per file.
# list = FALSE: the row numbers are returned as a matrix, not as a list
# times = 1: only one partition is drawn (more than one is possible)
# If the outcome is a factor, the split is done so that the classes of the
# outcome variable are represented in the same proportions.
trainRowNumbers <- createDataPartition(
  as.factor(biofilm_data_wide$indicator),
  p = 0.8, list = FALSE, times = 1
)

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 2: Create the training  dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
trainData <- biofilm_data_wide[trainRowNumbers, ]

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Step 3: Create the test dataset ----
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
testData <- biofilm_data_wide[-trainRowNumbers, ]


# What are the frequencies?
# Both tables are computed per file (wide format) so that the class shares
# of the full data set and of the training set are directly comparable.
table(biofilm_data_wide$indicator) / length(biofilm_data_wide$indicator)
table(trainData$indicator) / length(trainData$indicator)

# The outcome variable Y for later use.
y <- trainData$indicator

03_report/Tex/.ass7.tex.kate-swp

deleted100644 → 0
−1.19 KiB

File deleted.

+11 −0
Original line number Diff line number Diff line
@Comment{$ biblatex control file $}
@Comment{$ biblatex bcf format version 3.10 $}
% Do not modify this file!
%
% This is an auxiliary file used by the 'biblatex' package.
% This file may safely be deleted. It will be recreated as
% required.

@Control{biblatex-control,
  options = {3.10:0:1:1:0:1:1:0:0:1:0:1:3:1:3:1:0:0:3:1:79:+:+:nty},
}

03_report/Tex/ass7.aux

0 → 100644
+28 −0
Original line number Diff line number Diff line
\relax 
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax 
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\bibstyle{biblatex}
\bibdata{ass7-blx}
\citation{biblatex-control}
\abx@aux@refcontext{nty/global//global/global}
\@writefile{toc}{\contentsline {section}{\numberline {Task 1:}Clone a git repository}{1}{section.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {Task 2:}The \texttt  {caret} package}{1}{section.2}\protected@file@percent }
\newlabel{step:extract}{{5}{2}{Getting to know the data}{Item.7}{}}
\newlabel{step:plot}{{6}{2}{Getting to know the data}{Item.8}{}}
\newlabel{split:step1}{{13}{2}{Splitting the data set based on the outcome variable}{Item.15}{}}
\newlabel{split:step2}{{14}{2}{Splitting the data set based on the outcome variable}{Item.16}{}}
\newlabel{split:freq}{{15}{2}{Splitting the data set based on the outcome variable}{Item.17}{}}
\abx@aux@read@bbl@mdfivesum{242E9DB92557AC81F6B30CE4F9334513}
\gdef \@abspage@last{2}
Loading