Some changes (be4f31a3) · Commits · Johannes Bleher / AIDAHO_IDS_AS7

02_code/R/230620_JB_AssignmentSolution.R

deleted 100644 → 0

+0 −65

Original line number	Diff line number	Diff line
		# Start from a clean session: close the active graphics device when one is
		# open, then remove all (non-hidden) objects from the workspace.
		if (length(dev.list()) > 0L) {
		  dev.off()
		}
		rm(list = ls())


		library("caret")
		library("data.table")

		# Seed the random number generator so that the train/test partition drawn
		# later in the script is reproducible
		set.seed(seed = 100)

		# Restore the saved objects from disk; `biofilm_data` is expected to be one
		# of them. setDT() then converts it to a data.table by reference.
		load("01_data/biofilm.Rdata")
		setDT(biofilm_data)
		# Descriptive: the data set is made up of several files -- list each once
		file_list <- unique(biofilm_data[["file"]])

		# Take the second file as an example ...
		onefile <- biofilm_data[biofilm_data$file == file_list[2], ]
		# ... and draw its `int` values against `mz` as vertical lines
		plot(x = onefile$mz, y = onefile$int, type = "h", pch = 16)

		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		# Splitting based on outcome variables ----
		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		## Step 1: Get row numbers for the training data ----
		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		# Bin the mz axis to one decimal place. `:=` updates the column by reference
		# instead of copying the whole table as `biofilm_data$mz <- ...` does.
		biofilm_data[, mz := round(mz, digits = 1)]
		setkeyv(biofilm_data, cols = c("file", "indicator", "mz"))

		# After rounding, several rows can share one (file, indicator, mz) bin.
		# Sum their `int` values and keep the result: without the assignment the
		# aggregate is discarded and dcast() below would fall back to counting rows.
		# This step takes a long time
		biofilm_data_agg <- biofilm_data[, .(int = sum(int)),
		                                 by = c("file", "indicator", "mz")]

		# Wide layout: one row per file/indicator, one column per mz bin. Bins that
		# do not occur for a file are filled with 0 instead of NA so that prcomp()
		# receives a complete numeric matrix.
		biofilm_data_wide <- dcast(biofilm_data_agg, file + indicator ~ mz,
		                           value.var = "int", fill = 0)
		biofilm_data_wide <- biofilm_data_wide[, -c("file")]

		# Principal components of the mz columns only: `indicator` is the outcome,
		# not a feature, so it is left out of the PCA input.
		pcas <- prcomp(biofilm_data_wide[, -c("indicator")])


		# Draw the row indices of the training set: 80% of the rows.
		#   list = FALSE -> the indices come back as a matrix instead of a list
		#   times = 1    -> a single partition is created (several are possible)
		# Because the outcome is passed as a factor, sampling is done within each
		# class, so the class shares of the outcome are preserved in the split.
		trainRowNumbers <- createDataPartition(
		  y = as.factor(biofilm_data_wide$indicator),
		  times = 1,
		  p = 0.8,
		  list = FALSE
		)
		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		## Step 2: Create the training dataset ----
		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		# Rows selected by the partition
		trainData <- biofilm_data_wide[trainRowNumbers, ]

		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		## Step 3: Create the test dataset ----
		# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
		# All remaining rows
		testData <- biofilm_data_wide[-trainRowNumbers, ]


		# What are the class frequencies? Compare the complete sample with the
		# training set. Both are counted per row of the wide table, because that is
		# the level at which the split was drawn; the long table has one row per mz
		# value and would weight each class by its number of rows instead.
		prop.table(table(biofilm_data_wide$indicator))
		prop.table(table(trainData$indicator))

		# The outcome variable Y for later use.
		y <- trainData$indicator