Commit b30f86c9 authored by Johannes Bleher's avatar Johannes Bleher
Browse files

Neue Daten

parent fd265be9
Loading
Loading
Loading
Loading

01_data/ML_data.Rdata

0 → 100644
+16.9 MiB

File added.

No diff preview for this file type.

01_data/baseline.Rdata

0 → 100644
+57.6 MiB

File added.

No diff preview for this file type.

01_data/thedata.Rdata

0 → 100644
+58.9 MiB

File added.

No diff preview for this file type.

+190 −0
Original line number Diff line number Diff line
###############################################################
# Setup: clear session state, load packages, start the parallel backend
###############################################################

# Clear workspace and graphs
# NOTE(review): rm(list = ls()) in a script is an anti-pattern -- it clears
# only the global environment (not loaded packages or options); prefer
# restarting the R session for a truly clean run.
if(!is.null(dev.list())) dev.off()
rm(list = ls()) 

library("RaMS")        # grabMzmlData: read mzML mass-spectrometry files
library("dplyr")       # left_join, %>%
library("Matrix")      # sparse Matrix() for the hash signatures
library("wordspace")   # dist.matrix (cosine distance)
library("data.table")  # fast tables, := and non-equi joins
library("caret")       # createDataPartition, train, R2, RMSE
library("doParallel")  # parallel backend picked up by caret::train

# NOTE(review): the cluster is never stopped; add stopCluster(cl) at the end
# of the script so the worker processes are released.
ncores <- 2
cl <- makeCluster(ncores)
registerDoParallel(cl)

###############################################################
# Convert data ----
###############################################################

# Read one mzML file with RaMS, grabbing every data type with the settings
# used throughout this project.
#
# path: path to the .mzML file.
# Returns the list produced by grabMzmlData (at least $MS1 is used below).
read_mzml_everything <- function(path) {
    grabMzmlData(
        filename = path,
        grab_what = "everything",
        verbosity = 0,
        mz = NULL,        # no m/z filtering
        ppm = NULL,
        rtrange = NULL,   # keep the full retention-time range
        prefilter = -1    # keep all intensities
    )
}

# Blank run: defines the baseline signal that is removed further below
baseline <- read_mzml_everything("./01_data/2022-06-28-Blank_L7modified_HILIC_10.mzML")
save(baseline, file = "./01_data/baseline.Rdata")

# Sample run: the measurement of interest
thedata <- read_mzml_everything("./01_data/2022-06-28-Rktlsg3_L7modified_HILIC_6.mzML")
save(thedata, file = "./01_data/thedata.Rdata")


###############################################################
# Preprocess data ----
###############################################################

# Number of decimal places that m/z values are rounded to when binning
# (the original comment claimed "3 digit" rounding, but precision is 1)
precision <- 1

# Bin one MS1 data.table:
#  * round retention time to 6 decimal places (column `time`)
#  * round m/z to `digits` decimal places (column `mz_int`)
#  * drop zero-intensity rows, keep only time/mz_int/int
#  * sort by the binned m/z
# NOTE: `:=` adds the time/mz_int columns to the INPUT table by reference,
# exactly as the original inline code did (baseline$MS1 and thedata$MS1 gain
# these columns as a side effect).
bin_ms1 <- function(ms1, digits) {
    ms1[, time := round(rt, 6)]
    ms1[, mz_int := round(mz, digits)]
    ms1 <- ms1[int != 0, c("time", "mz_int", "int")]
    ms1[order(mz_int), ]
}

bl.MS1 <- bin_ms1(baseline$MS1, precision)
MS1 <- bin_ms1(thedata$MS1, precision)

# Lower bound of the retention-time window used by the non-equi join below
# (assumes `time` is in seconds, i.e. a 5 s look-back -- TODO confirm units)
bl.MS1[, lag_time := time - 5]


###############################################################
# Throw-out baseline ----
###############################################################
# Flag every sample scan whose binned m/z also occurs in the baseline run
# inside a retention-time window (data.table non-equi join, X[i] form with
# X = bl.MS1 and i = MS1; each on-condition reads X.column op i.column).
# NOTE(review): as written, the conditions are bl.lag_time <= MS1.time AND
# bl.time <= MS1.time; since lag_time = time - 5 the first condition is
# implied by the second, so the "window" degenerates to "any baseline scan at
# or before the sample time". If the intent was the window
# [bl.time - 5, bl.time], the last condition should read time >= time --
# TODO confirm.
MS1[, in_baseline:= 
        bl.MS1[MS1, #X[i] merge baseline with MS1 on (see next row)
               # the on statement works like this X.column op i.column
               on=.(mz_int == mz_int,lag_time <= time, time <= time)
               , .N, # Count the matches for each matched group (i.e. on key)
               , by=.EACHI][,N>0L]]  # TRUE when at least one baseline match

# Only use those observations that are NOT present in the baseline
MS1 <- MS1[in_baseline==FALSE,]
    
###############################################################
# Locality sensitive hash functions ----
###############################################################
# Number of random-projection hash functions
hash.function.count <- 100

# Column names holding the projection values, one per hash function
rr.col.names <- paste0("rr_vv", seq_len(hash.function.count))

# One row per distinct (binned) m/z value
mz_levels <- unique(MS1$mz_int)
idx_mz_levels <- seq_along(mz_levels)

mz_levels.df <- setDT(data.frame(idx_mz_levels, mz_int = mz_levels))

# NOTE(review): rnorm() is used without set.seed(), so the hash functions --
# and therefore the clustering below -- differ between runs; seed the RNG
# once for reproducible results.
mz_levels.df[, (rr.col.names) := lapply(seq_len(hash.function.count), function(k) {
    uu <- rnorm(length(mz_levels))  # random projection direction
    zz <- (mz_int)                  # standardisation (-mean(mz_int))/sd(mz_int) deliberately disabled
    zz * uu
})]

# Attach the projection columns to every MS1 observation via its m/z bin
cols <- c("mz_int", rr.col.names)
MS1_lj <- left_join(MS1, mz_levels.df[, ..cols], by = "mz_int")

# Aggregate per hash function and scan time: sign of the summed projection,
# mapped into {0, 1}
MS1.aggr <- MS1_lj[, lapply(.SD, function(x) max(sign(sum(x)), 0)),
                   .SDcols = rr.col.names, by = time]

# Binary hash signatures as a sparse matrix (one row per scan time)
MS1.hash.mat <- Matrix(as.matrix(MS1.aggr[, -1]), sparse = TRUE)

# projection <- MS1.hash.mat%*%(2^(1:hash.function.count))

# Cosine distances between the binary hash signatures
DD <- dist.matrix(MS1.hash.mat, method = "cosine")  # alternative: "jaccard"

# Hierarchical (Ward) clustering of scan times by hash signature
fit <- hclust(as.dist(DD), method = "ward.D2")
plot(fit)  # display dendrogram

# number of groups
kk <- 2
groups <- cutree(fit, k = kk)  # cut tree into kk clusters
plot(MS1.aggr$time / 10^6, groups)

# Retention times assigned to each cluster
group_times <- lapply(seq_len(kk), function(g) {
    tt <- MS1.aggr$time * (groups == g)
    tt[which(tt != 0)]
})

# Common y-axis limit so every cluster's density fits in one plot
max_val <- max(unlist(lapply(
    lapply(group_times, density, bw = "SJ", kernel = "epanechnikov"),
    function(d) d$y
)))
plot(density(group_times[[1]], bw = "SJ", kernel = "epanechnikov"),
     ylim = c(0, max_val),
     main = "Density of categorizations")
for (jj in seq_len(kk)[-1]) {  # remaining clusters; empty when kk == 1
    lines(density(group_times[[jj]], bw = "SJ", kernel = "epanechnikov"),
          col = colors()[200 + jj])
}

# Wide format: one row per scan time, one column per binned m/z,
# plus the cluster label as a factor
MS1_wide <- dcast(MS1[, -c("in_baseline")], time ~ mz_int, value.var = "int")
MS1_wide$group <- as.factor(groups)


###############################################################
# Elastic net: which m/z values discriminate the clusters ----
###############################################################

# NOTE(review): the original script drew the random partition without a
# seed, making the split (and everything downstream) irreproducible; seed
# once here.
set.seed(1)

# Stratified 80/20 split on the cluster label
trainIndex <- createDataPartition(MS1_wide$group, list = FALSE, p = 0.8)

trainSet <- MS1_wide[trainIndex, -c("time")]
validSet <- MS1_wide[-trainIndex, -c("time")]

# Cross-validated tuning of the elastic net
ctrl <- trainControl(method = "cv",
                     verbose = TRUE)

tuneGrid <- expand.grid(
    alpha = seq(0, 1, 0.1),
    lambda = c(seq(0.1, 2, by = 0.1), seq(2, 5, 0.5), seq(5, 25, 1))
)

# NOTE(review): the 2-level factor is recoded to numeric {0, 1} and fitted as
# a regression (metric = "Rsquared"); a binomial glmnet classification may be
# the more natural formulation -- confirm intent.
elastic.net <- train(y = as.numeric(trainSet$group) - 1,
                     x = trainSet[, -c("group")],
                     method = 'glmnet',
                     preProcess = c("center", "scale"),
                     trControl = ctrl,
                     tuneGrid = tuneGrid,
                     metric = "Rsquared")

# Hold-out predictions and fit statistics
predictions_elastic.net <- elastic.net %>% predict(validSet)

plot(data.frame(predictions_elastic.net, as.numeric(validSet$group) - 1))
R2(predictions_elastic.net, as.numeric(validSet$group) - 1)
RMSE(predictions_elastic.net, as.numeric(validSet$group) - 1)

# Coefficients of the best-tuned model, with the m/z bin each belongs to
# (coefficient row names are the wide-format m/z column names)
bestTuneModelCoefs <- as.data.frame.matrix(
    coef(elastic.net$finalModel, elastic.net$bestTune$lambda)
)
bestTuneModelCoefs$mz_vals <- as.numeric(rownames(bestTuneModelCoefs))

# Ten largest positive coefficients
bestTuneModelCoefs[head(order(bestTuneModelCoefs$s1, decreasing = TRUE), 10), ]
+8 −2
Original line number Diff line number Diff line
@@ -26,7 +26,13 @@ biofilm_data[,.(int=sum(int)),by=c("file","indicator","mz")]

biofilm_data_wide <- dcast(biofilm_data, file + indicator ~ mz, value.var = "int")

trainIndex <- createDataPartition(biofilm_data_wide$indicator,list=FALSE,p=0.8)
trainIndex <- createDataPartition(as.factor(biofilm_data_wide$indicator),list=FALSE,p=0.8)

trainSet <- biofilm_data_wide[trainIndex,]
testSet <- biofilm_data_wide[-trainIndex,]
validSet <- biofilm_data_wide[-trainIndex,]

table(trainSet$indicator)/length(trainSet$indicator)
table(validSet$indicator)/length(validSet$indicator)


save(biofilm_data_wide,validSet,trainSet,file="01_data/ML_data.Rdata")
Loading