Commit 48bce28a authored by jbleher's avatar jbleher
Browse files

delete file

parent 33348827
Loading
Loading
Loading
Loading
+0 −190
Original line number Diff line number Diff line
###############################################################
# Untargeted MS baseline removal + LSH clustering pipeline.
# Reads two mzML runs (blank + sample), strips baseline signals,
# hashes scans, clusters them, and fits an elastic net.
###############################################################

# Clear workspace and graphs
# NOTE(review): rm(list = ls()) only clears the global env and is
# discouraged in scripts — consider running in a fresh session instead.
if(!is.null(dev.list())) dev.off()
rm(list = ls()) 

library("RaMS")        # mzML parsing (grabMzmlData)
library("dplyr")       # left_join
library("Matrix")      # sparse matrices
library("wordspace")   # dist.matrix (cosine distance)
library("data.table")
library("caret")
library("doParallel")

# Register a 2-worker cluster so caret::train can parallelize CV folds.
# NOTE(review): stopCluster(cl) is never called at the end of the script;
# workers leak until the R session exits — TODO add stopCluster(cl).
ncores <- 2
cl <- makeCluster(ncores)
registerDoParallel(cl)

###############################################################
# Convert data ----
###############################################################


# Parse the blank (baseline) run in full. All optional filters are spelled
# out at their pass-through defaults so the extraction is explicit.
baseline <- grabMzmlData(
    filename = "./01_data/2022-06-28-Blank_L7modified_HILIC_10.mzML",
    grab_what = "everything",
    mz = NULL,
    ppm = NULL,
    rtrange = NULL,
    prefilter = -1,
    verbosity = 0
)

# Cache the parsed blank so reruns can skip the slow mzML parse.
save(baseline, file = "./01_data/baseline.Rdata")

# Parse the actual sample run with the same explicit settings as the blank.
thedata <- grabMzmlData(
    filename = "./01_data/2022-06-28-Rktlsg3_L7modified_HILIC_6.mzML",
    grab_what = "everything",
    mz = NULL,
    ppm = NULL,
    rtrange = NULL,
    prefilter = -1,
    verbosity = 0
)

# Cache the parsed sample run alongside the baseline.
save(thedata, file = "./01_data/thedata.Rdata")


###############################################################
# Preprocess data ----
###############################################################

# Number of decimal places used to bin m/z values into mz_int.
precision <- 1

# Round retention time and m/z, drop zero-intensity points, and sort by the
# binned m/z. The same transformation was previously duplicated verbatim for
# the baseline and the sample table, with misleading comments ("6 digit
# number" — it is 6 *decimal places*; "3 digit" contradicted precision = 1).
# Works on a copy so the raw parsed tables stay untouched (the original code
# modified baseline$MS1 / thedata$MS1 in place via data.table reference
# semantics — nothing downstream relied on that).
#
# ms1:       data.table with columns rt, mz, int
# mz_digits: decimals kept when binning mz into mz_int
# returns:   data.table(time, mz_int, int) ordered by mz_int
preprocess_ms1 <- function(ms1, mz_digits) {
    dt <- copy(ms1)
    dt[, time := round(rt, 6)]           # retention time to 6 decimal places
    dt[, mz_int := round(mz, mz_digits)] # bin m/z
    dt <- dt[int != 0, c("time", "mz_int", "int")]
    dt[order(mz_int), ]
}

bl.MS1 <- preprocess_ms1(baseline$MS1, precision)
MS1 <- preprocess_ms1(thedata$MS1, precision)

# Start of the 5-second tolerance window used by the non-equi join below.
bl.MS1[, lag_time := time - 5]


###############################################################
# Throw-out baseline ----
###############################################################
# Flag each sample scan point whose m/z bin also occurs in the blank run
# within a time tolerance, then drop those points. bl.MS1 is X, MS1 is i
# in the data.table non-equi join; .N per .EACHI counts matching blank
# rows for every sample row, and N > 0 becomes the in_baseline flag.
# NOTE(review): the on-clause pairs are x.col OP i.col, so this reads
# bl.lag_time <= MS1.time AND bl.time <= MS1.time — the second condition
# subsumes much of the first; the intended window was presumably
# MS1.time in [bl.time - 5, bl.time]. TODO confirm the join direction.
# Check out whether the mz_values are in the baseline (non-equi join)
MS1[, in_baseline:= 
        bl.MS1[MS1, #X[i] merge baseline with MS1 on (see next row)
               # the on statement works like this X.column == i.column.Value
               on=.(mz_int == mz_int,lag_time <= time, time <= time)
               , .N, # Count the matches for each matched group (i.e. on key)
               , by=.EACHI][,N>0L]]

# Only use those observations that are NOT present in the baseline
MS1 <- MS1[in_baseline==FALSE,]
    
###############################################################
# Locality sensitive hash functions ----
###############################################################    
# Random-hyperplane LSH: each m/z bin gets a Gaussian random weight per
# hash function; a scan's hash bit is the sign of the weighted sum of its
# m/z bins. Similar scans then share many bits.
# NOTE(review): no set.seed() before rnorm — results are not reproducible
# across runs. TODO add a seed.
# set the number of hash functions
hash.function.count <- 100 
  
#Now we get random vectors (i.e. hash functions)
rr.col.names <- paste0("rr_vv",1:hash.function.count)

# One row per distinct m/z bin; idx is a plain running index.
mz_levels <- unique(MS1$mz_int)
idx_mz_levels <- 1:length(mz_levels)

mz_levels.df <- setDT(data.frame(idx_mz_levels,mz_int=mz_levels))
# Each rr_vv column is mz_int scaled by a fresh Gaussian vector
# (the commented-out part once standardized mz_int first).
mz_levels.df[, (rr.col.names) := lapply(1:hash.function.count, function(x) {
    uu <- rnorm(length(mz_levels))
    zz <- (mz_int)#-mean(mz_int))/sd(mz_int)
    return(zz*uu)
})]
# Attach the 100 random projections to every MS1 observation by m/z bin.
cols <- c("mz_int",rr.col.names)
MS1_lj <- left_join(MS1,mz_levels.df[,..cols],by="mz_int")

# aggregate per hashfunction
# Per scan (time) and hash function: sum projections and keep the sign,
# clamped to {0, 1} — this is the scan's hash bit.
MS1.aggr <- MS1_lj[,lapply(.SD,function(x) max(sign(sum(x)),0)), .SDcols=rr.col.names, by=time]

# Make hash functions sparse
# Rows = scans, columns = hash bits; sparse since many bits are 0.
MS1.hash.mat <- Matrix(as.matrix(MS1.aggr[,-1]),sparse=TRUE)

# (Unused alternative: collapse the bit matrix to one integer per scan.)
# projection <- MS1.hash.mat%*%(2^(1:hash.function.count))
# 
# Pairwise cosine distance between scan hash signatures (wordspace).
DD <- dist.matrix(MS1.hash.mat,method="cosine"#"cosine" #"jaccard"
)


# Hierarchical clustering on the cosine distances (Ward linkage).
fit <- hclust(as.dist(DD), method="ward.D2") 
plot(fit) # display dendrogram

#number of groups 
kk = 2
groups <- cutree(fit, k=kk) # cut tree into kk (= 2) clusters
# Cluster membership over (scaled) retention time.
plot(MS1.aggr$time/10^6,groups)

# Retention times belonging to each cluster (zeros mask the other group).
group_times <- lapply(1:kk,function(x){
    group <- MS1.aggr$time*(groups==x)
    group <- group[which(group != 0)] 
})
# Common y-limit so all kk density curves fit in one plot.
max_val <- max(unlist(lapply(lapply(group_times,density,bw="SJ",kernel="epanechnikov"),function(x) x$y)))
plot(density(group_times[[1]],bw="SJ",kernel="epanechnikov"),ylim=c(0,max_val),
     main="Density of categorizations")
for(jj in 2:kk){
    lines(density(group_times[[jj]],bw="SJ",kernel="epanechnikov"),col=colors()[200+jj])
}

# Wide scan-by-m/z intensity matrix: one row per time, one column per mz_int.
MS1_wide <- dcast(MS1[,-c("in_baseline")], time ~ mz_int, value.var = "int")
# Attach the LSH cluster assignment as the target variable.
MS1_wide$group <- as.factor(groups)


# Stratified 80/20 train/validation split on the cluster label.
trainIndex <- createDataPartition(MS1_wide$group,list=FALSE,p=0.8)

trainSet <- MS1_wide[trainIndex,-c("time")]
validSet <- MS1_wide[-trainIndex,-c("time")]

ctrl <- trainControl( method = "cv" ,
                      verbose=TRUE)

# Full elastic-net grid: alpha 0 (ridge) .. 1 (lasso), coarse lambda path.
tuneGrid = expand.grid(alpha = seq(0,1,0.1), 
                       lambda = c(seq(0.1, 2, by =0.1) ,  seq(2, 5, 0.5) , seq(5, 25, 1))
                       ) 

# NOTE(review): the factor label is recoded to numeric 0/1 and fit as a
# *regression* (metric Rsquared); presumably intentional, but a
# classification setup (family binomial / metric Accuracy) may have been
# meant — confirm with the author.
elastic.net <-train(y= as.numeric(trainSet$group)-1,
             x=trainSet[,-c("group")],
             method = 'glmnet',
             preProcess = c("center","scale"),
             trControl = ctrl ,
             tuneGrid = tuneGrid,
             metric =  "Rsquared"
) 

predictions_elastic.net <- elastic.net %>% predict(validSet)


# Predicted vs. actual, plus R squared and RMSE on the validation set
plot(data.frame(predictions_elastic.net, as.numeric(validSet$group)-1))
R2(predictions_elastic.net, as.numeric(validSet$group)-1)
RMSE(predictions_elastic.net, as.numeric(validSet$group)-1)

# Print coefficients
# Coefficients at the best lambda; top-10 largest point at the most
# cluster-discriminative m/z bins.
bestTuneModelCoefs <- as.data.frame.matrix(coef(elastic.net$finalModel, elastic.net$bestTune$lambda))
bestTuneModelCoefs$mz_vals <- as.numeric(rownames(bestTuneModelCoefs))

bestTuneModelCoefs[head(order(bestTuneModelCoefs$s1,decreasing=TRUE),10),]

02_code/R/230621_JB_InclassAS7.R

deleted100644 → 0
+0 −38
Original line number Diff line number Diff line
# Aggregate biofilm MS data into a wide, ML-ready matrix and split it into
# stratified train/validation sets. (Header previously said "probabilities
# of default" — a copy-paste leftover from another assignment.)
# Clear workspace and graphs
if(!is.null(dev.list())) dev.off()
rm(list = ls())

library("caret")
library("data.table")

load("01_data/biofilm.Rdata")

setDT(biofilm_data)

names(biofilm_data)

thefiles <- unique(biofilm_data$file)

# Quick visual sanity check on a single run's spectrum.
onefile <- biofilm_data[thefiles[30]==file,]

plot(onefile$mz,onefile$int,type="h")

# Bin m/z to one decimal so scans share columns after casting.
biofilm_data[,mz:=round(mz,1)]

setkeyv(biofilm_data,cols=c("file","indicator","mz"))

# BUG FIX: the aggregation below was computed but its result discarded, so
# dcast() received duplicate (file, indicator, mz) rows and silently fell
# back to counting them (fun.aggregate = length) instead of using summed
# intensities. Assign the aggregate so the wide table holds what the
# value.var suggests.
biofilm_data <- biofilm_data[,.(int=sum(int)),by=c("file","indicator","mz")]

# One row per file, one column per m/z bin, values = summed intensity.
biofilm_data_wide <- dcast(biofilm_data, file + indicator ~ mz, value.var = "int")

# Stratified 80/20 split on the class label.
trainIndex <- createDataPartition(as.factor(biofilm_data_wide$indicator),list=FALSE,p=0.8)

trainSet <- biofilm_data_wide[trainIndex,]
validSet <- biofilm_data_wide[-trainIndex,]

# Check that the class balance is preserved in both splits.
table(trainSet$indicator)/length(trainSet$indicator)
table(validSet$indicator)/length(validSet$indicator)


save(biofilm_data_wide,validSet,trainSet,file="01_data/ML_data.Rdata")
+0 −47
Original line number Diff line number Diff line
# Classify biofilm samples with the first two principal components + kNN.
# (Header previously said "probabilities of default" — copy-paste leftover.)
# Clear workspace and graphs
if(!is.null(dev.list())) dev.off()
rm(list = ls())

library("caret")
library("data.table")


load(file="01_data/ML_data.Rdata")


# Predictors only: drop identifier and label columns.
XX <- subset(trainSet,select=-c(file,indicator))

pcs <- prcomp(XX,center=TRUE,scale=TRUE)

# Share of total variance explained by each component.
cc <- pcs$sdev^2/sum(pcs$sdev^2)

plot(cc,type="h",xlab="Number of PC", ylab="Contribution")

# Keep the first two score columns for visualization and modelling.
components <- pcs$x[,1:2]

plot(components,pch=16,col=as.factor(trainSet$indicator))

# BUG FIX: cbind() of a vector and a matrix yields a numeric *matrix*,
# but caret's formula interface expects a data frame; build one explicitly
# so the as.factor() conversion in the formula behaves predictably.
yy_pcs <- data.frame(indicator = trainSet$indicator,
                     PC1 = components[, 1],
                     PC2 = components[, 2])

ctrl <- trainControl( method = "repeatedcv" ,
                       repeats = 3 ,
                       number =10)
knnFit <- train ( as.factor(indicator ) ~ . ,
                  data = yy_pcs ,
                  method = "knn" ,
                  trControl = ctrl ,
                  preProcess = c("center" ,"scale") ,
                  tuneLength = 10)


# Model evaluation

XX_valid <- subset(validSet,select = -c(file,indicator))

# Project the validation data onto the PCs fitted on the training set.
pcs_new <- predict(pcs,newdata = XX_valid)

# Pass only the two components the model was trained on, as a data frame
# (previously the full score matrix was handed to predict()).
pred_class <- predict(knnFit,newdata=as.data.frame(pcs_new)[,1:2])

# Side-by-side comparison of predicted (0/1) vs. true labels.
prd_outcome_comparison <- cbind(as.numeric(pred_class)-1,validSet$indicator)
+0 −43
Original line number Diff line number Diff line
# Lasso classification of biofilm samples (glmnet via caret).
# (Header previously said "probabilities of default" — copy-paste leftover.)
# Clear workspace and graphs
if(!is.null(dev.list())) dev.off()
rm(list = ls())

library("caret")
library("data.table")

load(file="01_data/ML_data.Rdata")


# Standardization parameters must be estimated on the TRAINING data only.
train_preProcess <- preProcess(trainSet[,-c("indicator","file")],method=c("center","scale"))


trainSet_stdz <- predict(train_preProcess,trainSet[,-c("file")])
# BUG FIX: the validation set was previously standardized with its own
# center/scale estimates (a separate preProcess on validSet), which leaks
# information and mis-aligns the two feature spaces. Apply the
# training-set transform to the validation data instead.
validSet_stdz <- predict(train_preProcess,validSet[,-c("file")])


ctrl <- trainControl(method="cv",number=10,verbose=TRUE)

# alpha = 1 fixes the elastic net at the lasso; only lambda is tuned.
tuneGrid <- expand.grid( alpha= 1, lambda = seq(0.1,1,0.1))

lasso_fit <- train(y = as.factor(trainSet_stdz$indicator),
      x = trainSet_stdz[,-c("indicator")],
      method="glmnet",
      metric="Accuracy",
      trControl = ctrl,
      tuneGrid = tuneGrid)


pred <- predict(lasso_fit,validSet_stdz)
# Alternative:
#predictions_lasso <- lasso_fit %>% predict(validSet) 


# Coefficients at the best lambda; the largest ones identify the most
# discriminative m/z bins.
bestTuneModelCoefs <- as.data.frame.matrix(coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda))

bestTuneModelCoefs$mz_vals <- as.numeric(rownames(bestTuneModelCoefs))

bestTuneModelCoefs[head(order(bestTuneModelCoefs$s1,decreasing=TRUE),10),]

03_report/Tex/ass7_p2.aux

deleted100644 → 0
+0 −26
Original line number Diff line number Diff line
% Auto-generated LaTeX auxiliary file (ass7_p2.aux) — produced by a
% previous compilation run; do not edit by hand, it is overwritten on
% every build. Checked into / deleted from the repo as build residue.
\relax 
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax 
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\bibstyle{biblatex}
\bibdata{ass7_p2-blx,lib}
\citation{biblatex-control}
\abx@aux@refcontext{nty/global//global/global}
\@writefile{toc}{\contentsline {section}{\numberline {Task 1:}The corresponding git repository}{1}{section.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {Task 2:}Classification with Principle Components and kNN}{1}{section.2}\protected@file@percent }
\newlabel{eq:pca_scorings}{{2}{2}{Principal Component Analysis}{equation.2.2}{}}
\newlabel{step:PCA}{{3}{2}{Model fitting on training data}{Item.3}{}}
\newlabel{step:kNN}{{7}{2}{Model fitting on training data}{Item.7}{}}
\abx@aux@read@bbl@mdfivesum{242E9DB92557AC81F6B30CE4F9334513}
\gdef \@abspage@last{3}
Loading