Loading 02_code/R/DataCleaning.R +23 −14 Original line number Diff line number Diff line # Clear workspace rm(list = ls()) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # ----------------------------- # Set working directory where data and scripts are stored # necessary packages install.packages("hunspell") install.packages("wordcloud2") install.packages("tm") # setwd() # Required packages: hunspell, wordcloud2, tm, stringr library(stringr) library(hunspell) Loading @@ -19,7 +21,7 @@ library(tm) # Load the reviews load("01_data/raw/gamereviews.RData") View(game.rev) # TASK 3: CLEANING Loading Loading @@ -90,12 +92,8 @@ cat("Cleaned reviews:", nrow(reviews_final), "\n") cat("Saved to: 01_data/reviews_final.RData\n") # Create a vector with all words in the reviews all.words <- NULL all.words <- unlist(str_split(reviews.final$review, " ")) for (i in 1:nrow(reviews.final)){ tmp <- str_split(reviews.final$review[i], " ", simplify = TRUE) all.words <- c(all.words, tmp) } # sort the words in alphabetical order all.words <- sort(all.words) Loading Loading @@ -130,5 +128,16 @@ wordcloud2(data=word.freq.wof, size=1.2, minSize=3, color = "random-light", back word.stems <- unlist(hunspell_stem(all.words.wof)) word.freq.stems <- data.frame(table(word.stems)) wordcloud2(data=word.freq.wof, size=1.2, minSize=3, color = "random-light", backgroundColor = "black") wc_stem <- wordcloud2( data = word.freq.stems, size = 1.2, minSize = 3, color = "random-light", backgroundColor = "black" ) htmlwidgets::saveWidget( wc_stem, file = "03_report/wordcloud_stemmed.html", selfcontained = TRUE ) No newline at end of file 02_code/R/Game_Basic_Description.R +5 −2 Original line number Diff line number Diff line # Clear workspace rm(list = ls()) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # 
----------------------------- # Set working directory where data and scripts are stored ## setwd() # Load the Datasets Loading 02_code/R/task5_1_sentiment_bing_manual.R +6 −2 Original line number Diff line number Diff line # Clear workspace rm(list = ls()) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # ----------------------------- # Set working directory where data and scripts are stored ## setwd() load("01_data/reviews_final.RData") Loading 02_code/R/task5_part2 & 3_ R package.R→02_code/R/task5_part2 & 3_syuzhet&nrc.R +8 −3 Original line number Diff line number Diff line rm(list=ls()) # install.packages("syuzhet") # Requires the 'syuzhet' package library(syuzhet) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # ----------------------------- # Set working directory where data and scripts are stored ## setwd() load("01_data/reviews_final_sentiment_manual.RData") Loading 02_code/R/task5_part4_Sentiment_analysis_SharedDeepSeekData.Rdeleted 100644 → 0 +0 −188 Original line number Diff line number Diff line graphics.off() rm(list = ls()) # ================================================= # Set working directory # ================================================= setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ================================================= # Load data (FULL DeepSeek) # ================================================= load("01_data/sentiment_deepseek_255710.RData") load("01_data/reviews_final_sentiment_manual.RData") load("01_data/reviews_final_sentiment_syuzhet.RData") # ================================================= # Extract LLM sentiment (CORRECT for this dataset) # ================================================= llm_sentiment_all <- sentiment.ds$sentiment # Sanity check str(llm_sentiment_all) summary(llm_sentiment_all) # 
================================================= # Figure 1: LLM sentiment distribution (ALL reviews) # ================================================= sentiment_scores <- llm_sentiment_all par(mar = c(5, 4, 4, 2) + 0.1) hist( sentiment_scores, breaks = 30, freq = FALSE, col = "lightblue", border = "white", main = "Distribution of Sentiment Scores (All DeepSeek Reviews)", xlab = "Sentiment score (-1 = negative, +1 = positive)" ) lines( density(sentiment_scores, na.rm = TRUE), lwd = 2 ) # ================================================= # Align data for comparisons # ================================================= n_compare <- nrow(reviews_final) llm_sentiment <- tail(llm_sentiment_all, n_compare) # ================================================= # Figure 2: Manual Bing vs LLM # ================================================= manual_sentiment <- reviews_final$sentiment_manual keep <- is.finite(manual_sentiment) & is.finite(llm_sentiment) sum(keep) length(keep) par(mar = c(5, 5, 4, 2) + 0.1) plot( density(manual_sentiment[keep]), ylim = c(0, 4), col = "darkgray", lwd = 2, lty = 2, xlim = c(-1, 1), main = "Manual Bing vs LLM Sentiment", xlab = "Sentiment score", ylab = "Density" ) lines( density(llm_sentiment[keep]), col = "black", lwd = 2 ) legend( "topleft", legend = c("Manual Bing", "LLM (DeepSeek)"), col = c("darkgray", "black"), lwd = 2, lty = c(2, 1), bty = "n" ) # ================================================= # Figure 3: syuzhet # ================================================= syuzhet <- reviews_final$sentiment_syuzhet par(mar = c(4, 4, 3, 1) + 0.1) plot( density(syuzhet, na.rm = TRUE), col = "darkgray", lwd = 2, main = "syuzhet Bing Sentiment (Raw)", xlab = "Raw sentiment score", ylab = "Density" ) # ================================================= # Figure 4: syuzhet vs LLM # ================================================= par(mar = c(5, 5, 4, 2) + 0.1) plot( density(syuzhet, na.rm = TRUE), ylim = c(0, 4), col = "darkgray", lwd = 2, lty = 
2, xlim = c(-1, 1), main = "syuzhet vs LLM Sentiment", xlab = "Sentiment score (-1 to +1)", ylab = "Density" ) lines( density(llm_sentiment), col = "black", lwd = 2 ) legend( "topleft", legend = c("syuzhet", "LLM (DeepSeek)"), col = c("darkgray", "black"), lwd = 2, lty = c(2, 1), bty = "n" ) # ================================================= # Figure 5: NRC vs LLM # ================================================= nrc_sentiment <- tail( reviews_final$sentiment_nrc, length(llm_sentiment) ) keep <- is.finite(nrc_sentiment) & is.finite(llm_sentiment) #pdf(file.path(output_path, "nrc_vs_llm_density.pdf"), # width = 7, height = 5) par(mar = c(5, 5, 4, 2) + 0.1) plot( density(nrc_sentiment[keep]), col = "darkgray", lwd = 2, lty = 2, xlim = c(-1, 1), ylim = c(0, 4), main = "NRC Sentiment vs LLM Sentiment", xlab = "Sentiment score", ylab = "Density" ) lines( density(llm_sentiment[keep]), col = "black", lwd = 2, lty = 1 ) legend( "topleft", legend = c("NRC sentiment", "LLM (DeepSeek)"), col = c("darkgray", "black"), lwd = 2, lty = c(2, 1), bty = "n" ) dev.off() Loading
02_code/R/DataCleaning.R +23 −14 Original line number Diff line number Diff line # Clear workspace rm(list = ls()) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # ----------------------------- # Set working directory where data and scripts are stored # necessary packages install.packages("hunspell") install.packages("wordcloud2") install.packages("tm") # setwd() # Required packages: hunspell, wordcloud2, tm, stringr library(stringr) library(hunspell) Loading @@ -19,7 +21,7 @@ library(tm) # Load the reviews load("01_data/raw/gamereviews.RData") View(game.rev) # TASK 3: CLEANING Loading Loading @@ -90,12 +92,8 @@ cat("Cleaned reviews:", nrow(reviews_final), "\n") cat("Saved to: 01_data/reviews_final.RData\n") # Create a vector with all words in the reviews all.words <- NULL all.words <- unlist(str_split(reviews.final$review, " ")) for (i in 1:nrow(reviews.final)){ tmp <- str_split(reviews.final$review[i], " ", simplify = TRUE) all.words <- c(all.words, tmp) } # sort the words in alphabetical order all.words <- sort(all.words) Loading Loading @@ -130,5 +128,16 @@ wordcloud2(data=word.freq.wof, size=1.2, minSize=3, color = "random-light", back word.stems <- unlist(hunspell_stem(all.words.wof)) word.freq.stems <- data.frame(table(word.stems)) wordcloud2(data=word.freq.wof, size=1.2, minSize=3, color = "random-light", backgroundColor = "black") wc_stem <- wordcloud2( data = word.freq.stems, size = 1.2, minSize = 3, color = "random-light", backgroundColor = "black" ) htmlwidgets::saveWidget( wc_stem, file = "03_report/wordcloud_stemmed.html", selfcontained = TRUE ) No newline at end of file
02_code/R/Game_Basic_Description.R +5 −2 Original line number Diff line number Diff line # Clear workspace rm(list = ls()) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # ----------------------------- # Set working directory where data and scripts are stored ## setwd() # Load the Datasets Loading
02_code/R/task5_1_sentiment_bing_manual.R +6 −2 Original line number Diff line number Diff line # Clear workspace rm(list = ls()) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # ----------------------------- # Set working directory where data and scripts are stored ## setwd() load("01_data/reviews_final.RData") Loading
02_code/R/task5_part2 & 3_ R package.R→02_code/R/task5_part2 & 3_syuzhet&nrc.R +8 −3 Original line number Diff line number Diff line rm(list=ls()) # install.packages("syuzhet") # Requires the 'syuzhet' package library(syuzhet) # Set working directory setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment") # ----------------------------- # Setup # ----------------------------- # Set working directory where data and scripts are stored ## setwd() load("01_data/reviews_final_sentiment_manual.RData") Loading
# File: 02_code/R/task5_part4_Sentiment_analysis_SharedDeepSeekData.R
# Diff status: deleted 100644 → 0 (+0 −188)
#
# Compares the DeepSeek (LLM) sentiment scores with the manual Bing, syuzhet
# and NRC sentiment columns of `reviews_final` and draws five base-graphics
# figures (one histogram, four density plots).

graphics.off()
rm(list = ls())

# -----------------------------
# Setup
# -----------------------------
# Set working directory where data and scripts are stored
## setwd()

# =================================================
# Helper
# =================================================

# Overlay two density curves on a shared x-axis of [-1, 1] and y-axis of
# [0, 4].
#
# reference: numeric vector drawn as the dashed dark-gray curve.
# llm:       numeric vector drawn as the solid black curve.
# main:      plot title.
# xlab:      x-axis label.
# labels:    character vector of length 2 with the legend entries, in the
#            order (reference, llm).
#
# Missing values are dropped from each vector before its density is
# estimated, so a stray NA cannot abort the figure. The margin setting made
# with par() is left in place after the call.
plot_density_pair <- function(reference, llm, main, xlab, labels) {
  par(mar = c(5, 5, 4, 2) + 0.1)

  plot(
    density(reference, na.rm = TRUE),
    ylim = c(0, 4),
    col  = "darkgray",
    lwd  = 2,
    lty  = 2,
    xlim = c(-1, 1),
    main = main,
    xlab = xlab,
    ylab = "Density"
  )

  lines(
    density(llm, na.rm = TRUE),
    col = "black",
    lwd = 2,
    lty = 1
  )

  legend(
    "topleft",
    legend = labels,
    col    = c("darkgray", "black"),
    lwd    = 2,
    lty    = c(2, 1),
    bty    = "n"
  )
}

# =================================================
# Load data (FULL DeepSeek)
# =================================================
# The code below reads `sentiment.ds` and `reviews_final` after these calls.
# NOTE(review): which .RData file provides which object is not visible here;
# presumably the later load() overwrites `reviews_final` from the earlier
# one - confirm.
load("01_data/sentiment_deepseek_255710.RData")
load("01_data/reviews_final_sentiment_manual.RData")
load("01_data/reviews_final_sentiment_syuzhet.RData")

# =================================================
# Extract LLM sentiment (CORRECT for this dataset)
# =================================================
llm_sentiment_all <- sentiment.ds$sentiment

# Sanity check
str(llm_sentiment_all)
summary(llm_sentiment_all)

# =================================================
# Figure 1: LLM sentiment distribution (ALL reviews)
# =================================================
sentiment_scores <- llm_sentiment_all

par(mar = c(5, 4, 4, 2) + 0.1)

hist(
  sentiment_scores,
  breaks = 30,
  freq   = FALSE,
  col    = "lightblue",
  border = "white",
  main   = "Distribution of Sentiment Scores (All DeepSeek Reviews)",
  xlab   = "Sentiment score (-1 = negative, +1 = positive)"
)

lines(
  density(sentiment_scores, na.rm = TRUE),
  lwd = 2
)

# =================================================
# Align data for comparisons
# =================================================
n_compare <- nrow(reviews_final)

# tail() silently returns fewer than n_compare values when the DeepSeek
# vector is shorter, and the `&` masks below would then recycle; fail early.
stopifnot(length(llm_sentiment_all) >= n_compare)

# NOTE(review): assumes the last n_compare DeepSeek scores correspond
# row-for-row to reviews_final - confirm against how the data was built.
llm_sentiment <- tail(llm_sentiment_all, n_compare)

# =================================================
# Figure 2: Manual Bing vs LLM
# =================================================
manual_sentiment <- reviews_final$sentiment_manual

# Keep only the rows where both scores are finite.
keep <- is.finite(manual_sentiment) & is.finite(llm_sentiment)

sum(keep)
length(keep)

plot_density_pair(
  manual_sentiment[keep],
  llm_sentiment[keep],
  main   = "Manual Bing vs LLM Sentiment",
  xlab   = "Sentiment score",
  labels = c("Manual Bing", "LLM (DeepSeek)")
)

# =================================================
# Figure 3: syuzhet
# =================================================
syuzhet_sentiment <- reviews_final$sentiment_syuzhet

par(mar = c(4, 4, 3, 1) + 0.1)

plot(
  density(syuzhet_sentiment, na.rm = TRUE),
  col  = "darkgray",
  lwd  = 2,
  main = "syuzhet Bing Sentiment (Raw)",
  xlab = "Raw sentiment score",
  ylab = "Density"
)

# =================================================
# Figure 4: syuzhet vs LLM
# =================================================
# Each curve drops its own missing values (no paired mask here); previously
# the LLM curve had no NA handling at all and density() would stop on an NA.
plot_density_pair(
  syuzhet_sentiment,
  llm_sentiment,
  main   = "syuzhet vs LLM Sentiment",
  xlab   = "Sentiment score (-1 to +1)",
  labels = c("syuzhet", "LLM (DeepSeek)")
)

# =================================================
# Figure 5: NRC vs LLM
# =================================================
nrc_sentiment <- tail(
  reviews_final$sentiment_nrc,
  length(llm_sentiment)
)

keep <- is.finite(nrc_sentiment) & is.finite(llm_sentiment)

#pdf(file.path(output_path, "nrc_vs_llm_density.pdf"),
#    width = 7, height = 5)

plot_density_pair(
  nrc_sentiment[keep],
  llm_sentiment[keep],
  main   = "NRC Sentiment vs LLM Sentiment",
  xlab   = "Sentiment score",
  labels = c("NRC sentiment", "LLM (DeepSeek)")
)

# Closes the current graphics device. With the pdf() call above commented
# out this is whatever device the plots opened by default.
dev.off()