Finalize PoDS assignment: cleaning, sentiment analysis, README (b4ab99ac) · Commits · Amir Raoufi / PoDS_Ass_Team31

02_code/R/DataCleaning.R

+23 −14

Original line number	Diff line number	Diff line
		# Clear workspace (removes every object in the current environment)
		rm(list = ls())

		# Set working directory
		# NOTE(review): absolute, machine-specific path; the relative load() path
		# further down is resolved against whatever working directory is active.
		setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment")
		# -----------------------------
		# Setup
		# -----------------------------
		# Set working directory where data and scripts are stored

		# necessary packages
		# NOTE(review): install.packages() downloads and installs on every run of the
		# script; library() alone is enough once the packages are installed.
		install.packages("hunspell")
		install.packages("wordcloud2")
		install.packages("tm")
		# setwd()


		# Required packages: hunspell, wordcloud2, tm, stringr

		# Attach the packages used by the cleaning and word-cloud steps
		library(stringr)
		library(hunspell)
		@@ -19,7 +21,7 @@ library(tm)

		# Load the reviews
		# load() restores the objects stored in the .RData file into the workspace;
		# game.rev is presumably one of them — TODO confirm.
		load("01_data/raw/gamereviews.RData")
		# Open the loaded data in the interactive data viewer
		View(game.rev)



		# TASK 3: CLEANING
		@@ -90,12 +92,8 @@ cat("Cleaned reviews:", nrow(reviews_final), "\n")
		cat("Saved to: 01_data/reviews_final.RData\n")

		# Create a vector with all words in the reviews.
		# str_split() is vectorised over the review texts, so a single call splits
		# every review on single spaces; unlist() flattens the per-review pieces into
		# one character vector in review order.
		# The former row-by-row loop is dropped: it ran after this assignment and
		# appended the same words a second time (doubling every word count) while
		# regrowing the vector on each iteration.
		all.words <- unlist(str_split(reviews.final$review, " "))

		# sort the words in alphabetical order
		all.words <- sort(all.words)
		@@ -130,5 +128,16 @@ wordcloud2(data=word.freq.wof, size=1.2, minSize=3, color = "random-light", back
		# Stem every remaining word with hunspell, flatten the per-word results into
		# one vector, and tabulate how often each stem occurs.
		word.stems <- unlist(hunspell_stem(all.words.wof))
		word.freq.stems <- data.frame(table(word.stems))

		# NOTE(review): this call draws word.freq.wof, not the stem frequencies
		# computed above — confirm it is still intended in this section.
		wordcloud2(data=word.freq.wof, size=1.2, minSize=3, color = "random-light", backgroundColor = "black")

		# Build the word cloud from the stem frequencies and keep the widget object
		# so it can be written to disk below.
		wc_stem <- wordcloud2(
		data = word.freq.stems,
		size = 1.2,
		minSize = 3,
		color = "random-light",
		backgroundColor = "black"
		)

		# Save the stemmed word cloud as an HTML file in the report folder
		# (selfcontained = TRUE is passed so the output is a single file).
		htmlwidgets::saveWidget(
		wc_stem,
		file = "03_report/wordcloud_stemmed.html",
		selfcontained = TRUE
		)
		No newline at end of file

02_code/R/Game_Basic_Description.R

+5 −2

Original line number	Diff line number	Diff line
		# Clear workspace (removes every object in the current environment)
		rm(list = ls())

		# Set working directory
		# NOTE(review): absolute, machine-specific path; it only exists on the
		# original author's computer.
		setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment")
		# -----------------------------
		# Setup
		# -----------------------------
		# Set working directory where data and scripts are stored

		# Placeholder: fill in the local project path before running
		## setwd()


		# Load the Datasets

02_code/R/task5_1_sentiment_bing_manual.R

+6 −2

Original line number	Diff line number	Diff line
		# Clear workspace (removes every object in the current environment)
		rm(list = ls())

		# Set working directory
		# NOTE(review): absolute, machine-specific path; the relative load() below
		# is resolved against whatever working directory is active.
		setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment")
		# -----------------------------
		# Setup
		# -----------------------------
		# Set working directory where data and scripts are stored

		# Placeholder: fill in the local project path before running
		## setwd()

		# Restore the objects stored in the cleaned-reviews file into the workspace
		load("01_data/reviews_final.RData")

02_code/R/task5_part2 & 3_ R package.R→02_code/R/task5_part2 & 3_syuzhet&nrc.R

+8 −3

Original line number	Diff line number	Diff line
		# Clear workspace (removes every object in the current environment)
		rm(list=ls())


		# install.packages("syuzhet")
		# Requires the 'syuzhet' package
		library(syuzhet)

		# Set working directory
		# NOTE(review): absolute, machine-specific path; the relative load() below
		# is resolved against whatever working directory is active.
		setwd("C:/Users/golna.GOLNAZ/Desktop/Hohenheim/Data Science/Assignment")

		# -----------------------------
		# Setup
		# -----------------------------
		# Set working directory where data and scripts are stored

		# Placeholder: fill in the local project path before running
		## setwd()

		# Restore the objects stored in the manual-sentiment results file
		# (presumably written by the manual Bing sentiment script — TODO confirm).
		load("01_data/reviews_final_sentiment_manual.RData")

02_code/R/task5_part4_Sentiment_analysis_SharedDeepSeekData.R

deleted100644 → 0

+0 −188

Original line number	Diff line number	Diff line
		# Close any open graphics devices and start from an empty workspace
		graphics.off()
		rm(list = ls())

		# -----------------------------
		# Setup
		# -----------------------------
		# Set working directory where data and scripts are stored (the project root,
		# so that the relative 01_data/ paths below resolve). The previous
		# hard-coded, user-specific absolute path only worked on one machine.

		## setwd()

		# =================================================
		# Load data (FULL DeepSeek)
		# =================================================
		data_files <- c(
		  "01_data/sentiment_deepseek_255710.RData",
		  "01_data/reviews_final_sentiment_manual.RData",
		  "01_data/reviews_final_sentiment_syuzhet.RData"
		)

		# Fail early with a readable message when the working directory is wrong,
		# instead of a bare "cannot open file" error from load().
		missing_files <- data_files[!file.exists(data_files)]
		if (length(missing_files) > 0) {
		  stop(
		    "Input file(s) not found relative to '", getwd(), "': ",
		    paste(missing_files, collapse = ", "),
		    ". Set the working directory to the project root first.",
		    call. = FALSE
		  )
		}

		# A top-level for loop does not open a new frame, so load() still restores
		# the objects into the workspace, in the same order as before.
		for (data_file in data_files) {
		  load(data_file)
		}

		# =================================================
		# Extract LLM sentiment (CORRECT for this dataset)
		# =================================================
		llm_sentiment_all <- sentiment.ds$sentiment

		# Sanity check
		str(llm_sentiment_all)
		summary(llm_sentiment_all)

		# =================================================
		# Figure 1: LLM sentiment distribution (ALL reviews)
		# =================================================
		# Histogram of every DeepSeek score on the density scale, with a kernel
		# density curve drawn on top of it.
		sentiment_scores <- llm_sentiment_all

		par(mar = c(5, 4, 4, 2) + 0.1)

		# freq = FALSE puts the bars on the density scale so the curve added below
		# shares the same y-axis.
		hist(
		  sentiment_scores,
		  main = "Distribution of Sentiment Scores (All DeepSeek Reviews)",
		  xlab = "Sentiment score (-1 = negative, +1 = positive)",
		  breaks = 30,
		  freq = FALSE,
		  col = "lightblue",
		  border = "white"
		)

		# Missing scores are dropped for the density estimate only
		lines(density(sentiment_scores, na.rm = TRUE), lwd = 2)

		# =================================================
		# Align data for comparisons
		# =================================================
		# Take as many LLM scores as there are rows in reviews_final, counted from
		# the end of the full LLM vector.
		n_compare <- nrow(reviews_final)
		llm_sentiment <- tail(llm_sentiment_all, n_compare)

		# =================================================
		# Figure 2: Manual Bing vs LLM
		# =================================================
		manual_sentiment <- reviews_final$sentiment_manual

		# Use only positions where both scores are finite (drops NA, NaN, Inf)
		keep <- is.finite(manual_sentiment) & is.finite(llm_sentiment)
		sum(keep)
		length(keep)

		par(mar = c(5, 5, 4, 2) + 0.1)

		# Manual Bing scores: dashed grey curve that sets up the axes
		plot(
		  density(manual_sentiment[keep]),
		  main = "Manual Bing vs LLM Sentiment",
		  xlab = "Sentiment score",
		  ylab = "Density",
		  xlim = c(-1, 1),
		  ylim = c(0, 4),
		  col = "darkgray",
		  lty = 2,
		  lwd = 2
		)

		# LLM scores: solid black curve on the same axes
		lines(density(llm_sentiment[keep]), col = "black", lwd = 2)

		legend(
		  "topleft",
		  legend = c("Manual Bing", "LLM (DeepSeek)"),
		  col = c("darkgray", "black"),
		  lty = c(2, 1),
		  lwd = 2,
		  bty = "n"
		)

		# =================================================
		# Figure 3: syuzhet
		# =================================================
		# Kernel density of the syuzhet scores stored in reviews_final, drawn on the
		# default axis limits.
		syuzhet <- reviews_final$sentiment_syuzhet

		par(mar = c(4, 4, 3, 1) + 0.1)

		# Missing scores are dropped before the density is estimated
		plot(
		  density(syuzhet, na.rm = TRUE),
		  main = "syuzhet Bing Sentiment (Raw)",
		  xlab = "Raw sentiment score",
		  ylab = "Density",
		  col = "darkgray",
		  lwd = 2
		)



		# =================================================
		# Figure 4: syuzhet vs LLM
		# =================================================
		# Overlay the kernel density of the syuzhet scores (dashed grey) with the
		# density of the aligned LLM scores (solid black) on fixed axis limits.

		par(mar = c(5, 5, 4, 2) + 0.1)

		plot(
		  density(syuzhet, na.rm = TRUE),
		  ylim = c(0, 4),
		  col = "darkgray",
		  lwd = 2,
		  lty = 2,
		  xlim = c(-1, 1),
		  main = "syuzhet vs LLM Sentiment",
		  xlab = "Sentiment score (-1 to +1)",
		  ylab = "Density"
		)

		# na.rm = TRUE: density() stops with an error when its input contains
		# missing values. The other comparison figures filter llm_sentiment with
		# is.finite(), so missing scores are possible here too; without this the
		# LLM curve could fail after the syuzhet curve was already drawn.
		lines(
		  density(llm_sentiment, na.rm = TRUE),
		  col = "black",
		  lwd = 2
		)

		legend(
		  "topleft",
		  legend = c("syuzhet", "LLM (DeepSeek)"),
		  col = c("darkgray", "black"),
		  lwd = 2,
		  lty = c(2, 1),
		  bty = "n"
		)

		# =================================================
		# Figure 5: NRC vs LLM
		# =================================================
		# Take as many NRC scores as there are aligned LLM scores, counted from the
		# end of the column.
		nrc_sentiment <- tail(reviews_final$sentiment_nrc, length(llm_sentiment))

		# Use only positions where both scores are finite (drops NA, NaN, Inf)
		keep <- is.finite(nrc_sentiment) & is.finite(llm_sentiment)

		# PDF export is currently disabled:
		#pdf(file.path(output_path, "nrc_vs_llm_density.pdf"),
		# width = 7, height = 5)

		par(mar = c(5, 5, 4, 2) + 0.1)

		# NRC scores: dashed grey curve that sets up the axes
		plot(
		  density(nrc_sentiment[keep]),
		  main = "NRC Sentiment vs LLM Sentiment",
		  xlab = "Sentiment score",
		  ylab = "Density",
		  xlim = c(-1, 1),
		  ylim = c(0, 4),
		  col = "darkgray",
		  lty = 2,
		  lwd = 2
		)

		# LLM scores: solid black curve on the same axes
		lines(density(llm_sentiment[keep]), col = "black", lty = 1, lwd = 2)

		legend(
		  "topleft",
		  legend = c("NRC sentiment", "LLM (DeepSeek)"),
		  col = c("darkgray", "black"),
		  lty = c(2, 1),
		  lwd = 2,
		  bty = "n"
		)

		# Close the active graphics device.
		# NOTE(review): with the pdf() call above commented out, this closes the
		# device the figure was just drawn on — confirm that is intended.
		dev.off()