Commit 48bce28a authored by jbleher's avatar jbleher
Browse files

delete file

parent 33348827
Loading
Loading
Loading
Loading
+0 −190
Original line number Diff line number Diff line
###############################################################
# Untargeted MS baseline removal + LSH clustering pipeline.
# Reads two mzML runs (blank + sample), strips baseline signals,
# hashes scans, clusters them, and fits an elastic net.
###############################################################

# Clear workspace and graphs
# NOTE(review): rm(list = ls()) only clears the global env and is
# discouraged in scripts — consider running in a fresh session instead.
if(!is.null(dev.list())) dev.off()
rm(list = ls()) 

library("RaMS")        # mzML parsing (grabMzmlData)
library("dplyr")       # left_join
library("Matrix")      # sparse matrices
library("wordspace")   # dist.matrix (cosine distance)
library("data.table")
library("caret")
library("doParallel")

# Register a 2-worker cluster so caret::train can parallelize CV folds.
# NOTE(review): stopCluster(cl) is never called at the end of the script;
# workers leak until the R session exits — TODO add stopCluster(cl).
ncores <- 2
cl <- makeCluster(ncores)
registerDoParallel(cl)

###############################################################
# Convert data ----
###############################################################


# Parse the blank (baseline) run in full. All optional filters are spelled
# out at their pass-through defaults so the extraction is explicit.
baseline <- grabMzmlData(
    filename = "./01_data/2022-06-28-Blank_L7modified_HILIC_10.mzML",
    grab_what = "everything",
    mz = NULL,
    ppm = NULL,
    rtrange = NULL,
    prefilter = -1,
    verbosity = 0
)

# Cache the parsed blank so reruns can skip the slow mzML parse.
save(baseline, file = "./01_data/baseline.Rdata")

# Parse the actual sample run with the same explicit settings as the blank.
thedata <- grabMzmlData(
    filename = "./01_data/2022-06-28-Rktlsg3_L7modified_HILIC_6.mzML",
    grab_what = "everything",
    mz = NULL,
    ppm = NULL,
    rtrange = NULL,
    prefilter = -1,
    verbosity = 0
)

# Cache the parsed sample run alongside the baseline.
save(thedata, file = "./01_data/thedata.Rdata")


###############################################################
# Preprocess data ----
###############################################################

# Number of decimal places used to bin m/z values into mz_int.
precision <- 1

# Round retention time and m/z, drop zero-intensity points, and sort by the
# binned m/z. The same transformation was previously duplicated verbatim for
# the baseline and the sample table, with misleading comments ("6 digit
# number" — it is 6 *decimal places*; "3 digit" contradicted precision = 1).
# Works on a copy so the raw parsed tables stay untouched (the original code
# modified baseline$MS1 / thedata$MS1 in place via data.table reference
# semantics — nothing downstream relied on that).
#
# ms1:       data.table with columns rt, mz, int
# mz_digits: decimals kept when binning mz into mz_int
# returns:   data.table(time, mz_int, int) ordered by mz_int
preprocess_ms1 <- function(ms1, mz_digits) {
    dt <- copy(ms1)
    dt[, time := round(rt, 6)]           # retention time to 6 decimal places
    dt[, mz_int := round(mz, mz_digits)] # bin m/z
    dt <- dt[int != 0, c("time", "mz_int", "int")]
    dt[order(mz_int), ]
}

bl.MS1 <- preprocess_ms1(baseline$MS1, precision)
MS1 <- preprocess_ms1(thedata$MS1, precision)

# Start of the 5-second tolerance window used by the non-equi join below.
bl.MS1[, lag_time := time - 5]


###############################################################
# Throw-out baseline ----
###############################################################
# Flag each sample scan point whose m/z bin also occurs in the blank run
# within a time tolerance, then drop those points. bl.MS1 is X, MS1 is i
# in the data.table non-equi join; .N per .EACHI counts matching blank
# rows for every sample row, and N > 0 becomes the in_baseline flag.
# NOTE(review): the on-clause pairs are x.col OP i.col, so this reads
# bl.lag_time <= MS1.time AND bl.time <= MS1.time — the second condition
# subsumes much of the first; the intended window was presumably
# MS1.time in [bl.time - 5, bl.time]. TODO confirm the join direction.
# Check out whether the mz_values are in the baseline (non-equi join)
MS1[, in_baseline:= 
        bl.MS1[MS1, #X[i] merge baseline with MS1 on (see next row)
               # the on statement works like this X.column == i.column.Value
               on=.(mz_int == mz_int,lag_time <= time, time <= time)
               , .N, # Count the matches for each matched group (i.e. on key)
               , by=.EACHI][,N>0L]]

# Only use those observations that are NOT present in the baseline
MS1 <- MS1[in_baseline==FALSE,]
    
###############################################################
# Locality sensitive hash functions ----
###############################################################    
# Random-hyperplane LSH: each m/z bin gets a Gaussian random weight per
# hash function; a scan's hash bit is the sign of the weighted sum of its
# m/z bins. Similar scans then share many bits.
# NOTE(review): no set.seed() before rnorm — results are not reproducible
# across runs. TODO add a seed.
# set the number of hash functions
hash.function.count <- 100 
  
#Now we get random vectors (i.e. hash functions)
rr.col.names <- paste0("rr_vv",1:hash.function.count)

# One row per distinct m/z bin; idx is a plain running index.
mz_levels <- unique(MS1$mz_int)
idx_mz_levels <- 1:length(mz_levels)

mz_levels.df <- setDT(data.frame(idx_mz_levels,mz_int=mz_levels))
# Each rr_vv column is mz_int scaled by a fresh Gaussian vector
# (the commented-out part once standardized mz_int first).
mz_levels.df[, (rr.col.names) := lapply(1:hash.function.count, function(x) {
    uu <- rnorm(length(mz_levels))
    zz <- (mz_int)#-mean(mz_int))/sd(mz_int)
    return(zz*uu)
})]
# Attach the 100 random projections to every MS1 observation by m/z bin.
cols <- c("mz_int",rr.col.names)
MS1_lj <- left_join(MS1,mz_levels.df[,..cols],by="mz_int")

# aggregate per hashfunction
# Per scan (time) and hash function: sum projections and keep the sign,
# clamped to {0, 1} — this is the scan's hash bit.
MS1.aggr <- MS1_lj[,lapply(.SD,function(x) max(sign(sum(x)),0)), .SDcols=rr.col.names, by=time]

# Make hash functions sparse
# Rows = scans, columns = hash bits; sparse since many bits are 0.
MS1.hash.mat <- Matrix(as.matrix(MS1.aggr[,-1]),sparse=TRUE)

# (Unused alternative: collapse the bit matrix to one integer per scan.)
# projection <- MS1.hash.mat%*%(2^(1:hash.function.count))
# 
# Pairwise cosine distance between scan hash signatures (wordspace).
DD <- dist.matrix(MS1.hash.mat,method="cosine"#"cosine" #"jaccard"
)


# Hierarchical clustering on the cosine distances (Ward linkage).
fit <- hclust(as.dist(DD), method="ward.D2") 
plot(fit) # display dendrogram

#number of groups 
kk = 2
groups <- cutree(fit, k=kk) # cut tree into kk (= 2) clusters
# Cluster membership over (scaled) retention time.
plot(MS1.aggr$time/10^6,groups)

# Retention times belonging to each cluster (zeros mask the other group).
group_times <- lapply(1:kk,function(x){
    group <- MS1.aggr$time*(groups==x)
    group <- group[which(group != 0)] 
})
# Common y-limit so all kk density curves fit in one plot.
max_val <- max(unlist(lapply(lapply(group_times,density,bw="SJ",kernel="epanechnikov"),function(x) x$y)))
plot(density(group_times[[1]],bw="SJ",kernel="epanechnikov"),ylim=c(0,max_val),
     main="Density of categorizations")
for(jj in 2:kk){
    lines(density(group_times[[jj]],bw="SJ",kernel="epanechnikov"),col=colors()[200+jj])
}

# Wide scan-by-m/z intensity matrix: one row per time, one column per mz_int.
MS1_wide <- dcast(MS1[,-c("in_baseline")], time ~ mz_int, value.var = "int")
# Attach the LSH cluster assignment as the target variable.
MS1_wide$group <- as.factor(groups)


# Stratified 80/20 train/validation split on the cluster label.
trainIndex <- createDataPartition(MS1_wide$group,list=FALSE,p=0.8)

trainSet <- MS1_wide[trainIndex,-c("time")]
validSet <- MS1_wide[-trainIndex,-c("time")]

ctrl <- trainControl( method = "cv" ,
                      verbose=TRUE)

# Full elastic-net grid: alpha 0 (ridge) .. 1 (lasso), coarse lambda path.
tuneGrid = expand.grid(alpha = seq(0,1,0.1), 
                       lambda = c(seq(0.1, 2, by =0.1) ,  seq(2, 5, 0.5) , seq(5, 25, 1))
                       ) 

# NOTE(review): the factor label is recoded to numeric 0/1 and fit as a
# *regression* (metric Rsquared); presumably intentional, but a
# classification setup (family binomial / metric Accuracy) may have been
# meant — confirm with the author.
elastic.net <-train(y= as.numeric(trainSet$group)-1,
             x=trainSet[,-c("group")],
             method = 'glmnet',
             preProcess = c("center","scale"),
             trControl = ctrl ,
             tuneGrid = tuneGrid,
             metric =  "Rsquared"
) 

predictions_elastic.net <- elastic.net %>% predict(validSet)


# Predicted vs. actual, plus R squared and RMSE on the validation set
plot(data.frame(predictions_elastic.net, as.numeric(validSet$group)-1))
R2(predictions_elastic.net, as.numeric(validSet$group)-1)
RMSE(predictions_elastic.net, as.numeric(validSet$group)-1)

# Print coefficients
# Coefficients at the best lambda; top-10 largest point at the most
# cluster-discriminative m/z bins.
bestTuneModelCoefs <- as.data.frame.matrix(coef(elastic.net$finalModel, elastic.net$bestTune$lambda))
bestTuneModelCoefs$mz_vals <- as.numeric(rownames(bestTuneModelCoefs))

bestTuneModelCoefs[head(order(bestTuneModelCoefs$s1,decreasing=TRUE),10),]

02_code/R/230621_JB_InclassAS7.R

deleted100644 → 0
+0 −38
Original line number Diff line number Diff line
# Aggregate biofilm MS data into a wide, ML-ready matrix and split it into
# stratified train/validation sets. (Header previously said "probabilities
# of default" — a copy-paste leftover from another assignment.)
# Clear workspace and graphs
if(!is.null(dev.list())) dev.off()
rm(list = ls())

library("caret")
library("data.table")

load("01_data/biofilm.Rdata")

setDT(biofilm_data)

names(biofilm_data)

thefiles <- unique(biofilm_data$file)

# Quick visual sanity check on a single run's spectrum.
onefile <- biofilm_data[thefiles[30]==file,]

plot(onefile$mz,onefile$int,type="h")

# Bin m/z to one decimal so scans share columns after casting.
biofilm_data[,mz:=round(mz,1)]

setkeyv(biofilm_data,cols=c("file","indicator","mz"))

# BUG FIX: the aggregation below was computed but its result discarded, so
# dcast() received duplicate (file, indicator, mz) rows and silently fell
# back to counting them (fun.aggregate = length) instead of using summed
# intensities. Assign the aggregate so the wide table holds what the
# value.var suggests.
biofilm_data <- biofilm_data[,.(int=sum(int)),by=c("file","indicator","mz")]

# One row per file, one column per m/z bin, values = summed intensity.
biofilm_data_wide <- dcast(biofilm_data, file + indicator ~ mz, value.var = "int")

# Stratified 80/20 split on the class label.
trainIndex <- createDataPartition(as.factor(biofilm_data_wide$indicator),list=FALSE,p=0.8)

trainSet <- biofilm_data_wide[trainIndex,]
validSet <- biofilm_data_wide[-trainIndex,]

# Check that the class balance is preserved in both splits.
table(trainSet$indicator)/length(trainSet$indicator)
table(validSet$indicator)/length(validSet$indicator)


save(biofilm_data_wide,validSet,trainSet,file="01_data/ML_data.Rdata")
+0 −47
Original line number Diff line number Diff line
# Classify biofilm samples with the first two principal components + kNN.
# (Header previously said "probabilities of default" — copy-paste leftover.)
# Clear workspace and graphs
if(!is.null(dev.list())) dev.off()
rm(list = ls())

library("caret")
library("data.table")


load(file="01_data/ML_data.Rdata")


# Predictors only: drop identifier and label columns.
XX <- subset(trainSet,select=-c(file,indicator))

pcs <- prcomp(XX,center=TRUE,scale=TRUE)

# Share of total variance explained by each component.
cc <- pcs$sdev^2/sum(pcs$sdev^2)

plot(cc,type="h",xlab="Number of PC", ylab="Contribution")

# Keep the first two score columns for visualization and modelling.
components <- pcs$x[,1:2]

plot(components,pch=16,col=as.factor(trainSet$indicator))

# BUG FIX: cbind() of a vector and a matrix yields a numeric *matrix*,
# but caret's formula interface expects a data frame; build one explicitly
# so the as.factor() conversion in the formula behaves predictably.
yy_pcs <- data.frame(indicator = trainSet$indicator,
                     PC1 = components[, 1],
                     PC2 = components[, 2])

ctrl <- trainControl( method = "repeatedcv" ,
                       repeats = 3 ,
                       number =10)
knnFit <- train ( as.factor(indicator ) ~ . ,
                  data = yy_pcs ,
                  method = "knn" ,
                  trControl = ctrl ,
                  preProcess = c("center" ,"scale") ,
                  tuneLength = 10)


# Model evaluation

XX_valid <- subset(validSet,select = -c(file,indicator))

# Project the validation data onto the PCs fitted on the training set.
pcs_new <- predict(pcs,newdata = XX_valid)

# Pass only the two components the model was trained on, as a data frame
# (previously the full score matrix was handed to predict()).
pred_class <- predict(knnFit,newdata=as.data.frame(pcs_new)[,1:2])

# Side-by-side comparison of predicted (0/1) vs. true labels.
prd_outcome_comparison <- cbind(as.numeric(pred_class)-1,validSet$indicator)
+0 −43
Original line number Diff line number Diff line
# Lasso classification of biofilm samples (glmnet via caret).
# (Header previously said "probabilities of default" — copy-paste leftover.)
# Clear workspace and graphs
if(!is.null(dev.list())) dev.off()
rm(list = ls())

library("caret")
library("data.table")

load(file="01_data/ML_data.Rdata")


# Standardization parameters must be estimated on the TRAINING data only.
train_preProcess <- preProcess(trainSet[,-c("indicator","file")],method=c("center","scale"))


trainSet_stdz <- predict(train_preProcess,trainSet[,-c("file")])
# BUG FIX: the validation set was previously standardized with its own
# center/scale estimates (a separate preProcess on validSet), which leaks
# information and mis-aligns the two feature spaces. Apply the
# training-set transform to the validation data instead.
validSet_stdz <- predict(train_preProcess,validSet[,-c("file")])


ctrl <- trainControl(method="cv",number=10,verbose=TRUE)

# alpha = 1 fixes the elastic net at the lasso; only lambda is tuned.
tuneGrid <- expand.grid( alpha= 1, lambda = seq(0.1,1,0.1))

lasso_fit <- train(y = as.factor(trainSet_stdz$indicator),
      x = trainSet_stdz[,-c("indicator")],
      method="glmnet",
      metric="Accuracy",
      trControl = ctrl,
      tuneGrid = tuneGrid)


pred <- predict(lasso_fit,validSet_stdz)
# Alternative:
#predictions_lasso <- lasso_fit %>% predict(validSet) 


# Coefficients at the best lambda; the largest ones identify the most
# discriminative m/z bins.
bestTuneModelCoefs <- as.data.frame.matrix(coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda))

bestTuneModelCoefs$mz_vals <- as.numeric(rownames(bestTuneModelCoefs))

bestTuneModelCoefs[head(order(bestTuneModelCoefs$s1,decreasing=TRUE),10),]

03_report/Tex/ass7_p2.aux

deleted100644 → 0
+0 −26
Original line number Diff line number Diff line
% Auto-generated LaTeX auxiliary file (ass7_p2.aux) — produced by a
% previous compilation run; do not edit by hand, it is overwritten on
% every build. Checked into / deleted from the repo as build residue.
\relax 
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax 
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\bibstyle{biblatex}
\bibdata{ass7_p2-blx,lib}
\citation{biblatex-control}
\abx@aux@refcontext{nty/global//global/global}
\@writefile{toc}{\contentsline {section}{\numberline {Task 1:}The corresponding git repository}{1}{section.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {Task 2:}Classification with Principle Components and kNN}{1}{section.2}\protected@file@percent }
\newlabel{eq:pca_scorings}{{2}{2}{Principal Component Analysis}{equation.2.2}{}}
\newlabel{step:PCA}{{3}{2}{Model fitting on training data}{Item.3}{}}
\newlabel{step:kNN}{{7}{2}{Model fitting on training data}{Item.7}{}}
\abx@aux@read@bbl@mdfivesum{242E9DB92557AC81F6B30CE4F9334513}
\gdef \@abspage@last{3}
Loading