Commit b4ab99ac authored by amra669c
Browse files

Finalize PoDS assignment: cleaning, sentiment analysis, README

parent 4ef6be93
Loading
Loading
Loading
Loading
+23 −14
Original line number Diff line number Diff line
# -----------------------------
# Setup
# -----------------------------
# Clear workspace
rm(list = ls())

# Working directory: run this script from the folder where data and scripts
# are stored (the relative "01_data/..." paths used below are resolved against
# it), or uncomment the next line and fill in your own path. A user-specific
# absolute path is deliberately not hard-coded, because setwd() stops with an
# error on every machine where that folder does not exist.
# setwd()

# Required packages: hunspell, wordcloud2, tm, stringr
# Install only the ones that are missing instead of re-installing on every run.
required_pkgs <- c("hunspell", "wordcloud2", "tm", "stringr")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(stringr)
library(hunspell)
@@ -19,7 +21,7 @@ library(tm)

# Load the reviews (the file is expected to provide the object `game.rev`)
load("01_data/raw/gamereviews.RData")

# View() opens a data viewer, so only call it when someone is there to look at
# it; this keeps the script usable from Rscript / batch runs.
if (interactive()) {
  View(game.rev)
}



# TASK 3: CLEANING
@@ -90,12 +92,8 @@ cat("Cleaned reviews:", nrow(reviews_final), "\n")
cat("Saved to: 01_data/reviews_final.RData\n")

# Create a vector with all words in the reviews.
# str_split() is vectorised over the whole review column, so one call replaces
# the former row-by-row loop. Running that loop after this assignment would
# append every word a second time and grow the vector quadratically.
# NOTE(review): this section reads `reviews.final` (dot) while the message
# printed just above refers to `reviews_final` (underscore) - confirm which
# object name is the intended one.
all.words <- unlist(str_split(reviews.final$review, " "))

# sort the words in alphabetical order
all.words <- sort(all.words)
@@ -130,5 +128,16 @@ wordcloud2(data=word.freq.wof, size=1.2, minSize=3, color = "random-light", back
# Reduce every remaining word to its hunspell stem(s) and count how often each
# stem occurs.
word.stems <- unlist(hunspell_stem(all.words.wof))
word.freq.stems <- data.frame(table(word.stems))

# Word cloud of the stemmed frequencies. The cloud must be built from
# word.freq.stems; plotting word.freq.wof here would just repeat the
# un-stemmed cloud.
wc_stem <- wordcloud2(
  data = word.freq.stems,
  size = 1.2,
  minSize = 3,
  color = "random-light",
  backgroundColor = "black"
)

# Show the cloud when working interactively; batch runs only write the file.
if (interactive()) {
  print(wc_stem)
}

# Make sure the report folder exists before writing into it.
dir.create("03_report", showWarnings = FALSE, recursive = TRUE)

htmlwidgets::saveWidget(
  wc_stem,
  file = "03_report/wordcloud_stemmed.html",
  selfcontained = TRUE
)
 No newline at end of file
+5 −2
Original line number Diff line number Diff line
# -----------------------------
# Setup
# -----------------------------
# Clear workspace
rm(list = ls())

# Set working directory where data and scripts are stored.
# Run the script from that folder, or uncomment the line below and fill in
# your own path. A user-specific absolute path is deliberately not hard-coded,
# because setwd() stops with an error on every machine where it does not exist.
## setwd()


# Load the Datasets
+6 −2
Original line number Diff line number Diff line
# -----------------------------
# Setup
# -----------------------------
# Clear workspace
rm(list = ls())

# Set working directory where data and scripts are stored.
# Run the script from that folder (the relative "01_data/..." path below is
# resolved against it), or uncomment the next line and fill in your own path.
# A user-specific absolute path is deliberately not hard-coded, because
# setwd() stops with an error on every machine where it does not exist.
## setwd()

load("01_data/reviews_final.RData")

+8 −3
Original line number Diff line number Diff line
# -----------------------------
# Setup
# -----------------------------
# Clear workspace
rm(list = ls())

# Requires the 'syuzhet' package
# (install once with: install.packages("syuzhet"))
library(syuzhet)

# Set working directory where data and scripts are stored.
# Run the script from that folder (the relative "01_data/..." path below is
# resolved against it), or uncomment the next line and fill in your own path.
# A user-specific absolute path is deliberately not hard-coded, because
# setwd() stops with an error on every machine where it does not exist.
## setwd()

load("01_data/reviews_final_sentiment_manual.RData")

+0 −188
Original line number Diff line number Diff line
# Close all open graphics devices and clear the workspace
graphics.off()
rm(list = ls())

# =================================================
# Set working directory
# =================================================
# Run the script from the folder where data and scripts are stored (the
# relative "01_data/..." paths below are resolved against it), or uncomment
# the next line and fill in your own path. A user-specific absolute path is
# deliberately not hard-coded, because setwd() stops with an error on every
# machine where it does not exist.
## setwd()

# =================================================
# Load data (FULL DeepSeek)
# =================================================
# The code below reads `sentiment.ds` and `reviews_final`, which are expected
# to come from these files.
# NOTE(review): if both reviews_final_* files define `reviews_final`, the
# later load() replaces the earlier object - confirm that the syuzhet file
# also carries the sentiment_manual and sentiment_nrc columns used further
# down.
load("01_data/sentiment_deepseek_255710.RData")
load("01_data/reviews_final_sentiment_manual.RData")
load("01_data/reviews_final_sentiment_syuzhet.RData")

# =================================================
# Extract LLM sentiment (CORRECT for this dataset)
# =================================================
# Pull the sentiment column out of the loaded DeepSeek object.
llm_sentiment_all <- sentiment.ds$sentiment

# Sanity check: structure and five-number summary of the extracted scores
str(llm_sentiment_all)
summary(llm_sentiment_all)

# =================================================
# Figure 1: LLM sentiment distribution (ALL reviews)
# =================================================
sentiment_scores <- llm_sentiment_all

par(mar = c(5, 4, 4, 2) + 0.1)

# Histogram on the density scale (freq = FALSE) so that the kernel density
# curve drawn afterwards shares the same y axis.
hist(
  sentiment_scores,
  main = "Distribution of Sentiment Scores (All DeepSeek Reviews)",
  xlab = "Sentiment score (-1 = negative, +1 = positive)",
  freq = FALSE,
  breaks = 30,
  border = "white",
  col = "lightblue"
)

# Overlay the kernel density estimate; missing scores are dropped.
lines(density(sentiment_scores, na.rm = TRUE), lwd = 2)

# =================================================
# Align data for comparisons
# =================================================
# Keep as many LLM scores as there are rows in reviews_final, taken from the
# END of the full LLM vector.
# NOTE(review): this assumes the last n_compare DeepSeek scores correspond
# row-by-row to reviews_final - confirm against how sentiment.ds was built.
n_compare <- nrow(reviews_final)
llm_sentiment <- tail(llm_sentiment_all, n_compare)

# tail() silently returns fewer values when llm_sentiment_all is shorter than
# n_compare; the element-wise comparisons below would then recycle and pair
# scores with the wrong reviews, so stop early with a clear message instead.
if (length(llm_sentiment) != n_compare) {
  stop(
    "Only ", length(llm_sentiment), " LLM sentiment scores available for ",
    n_compare, " reviews; cannot align the two.",
    call. = FALSE
  )
}

# =================================================
# Figure 2: Manual Bing vs LLM
# =================================================
manual_sentiment <- reviews_final$sentiment_manual

# Compare only reviews for which BOTH scores are finite (drops NA/NaN/Inf).
keep <- is.finite(manual_sentiment) & is.finite(llm_sentiment)

# How many reviews enter the comparison, out of how many in total
sum(keep)
length(keep)

par(mar = c(5, 5, 4, 2) + 0.1)

# Manual scores: dashed grey curve
plot(
  density(manual_sentiment[keep]),
  main = "Manual Bing vs LLM Sentiment",
  xlab = "Sentiment score",
  ylab = "Density",
  xlim = c(-1, 1),
  ylim = c(0, 4),
  col = "darkgray",
  lty = 2,
  lwd = 2
)

# LLM scores: solid black curve on the same axes
lines(density(llm_sentiment[keep]), col = "black", lwd = 2)

legend(
  "topleft",
  legend = c("Manual Bing", "LLM (DeepSeek)"),
  col = c("darkgray", "black"),
  lty = c(2, 1),
  lwd = 2,
  bty = "n"
)

# =================================================
# Figure 3: syuzhet
# =================================================
# Density of the syuzhet scores stored in reviews_final; missing values are
# dropped and the axes are left at their automatic ranges.
syuzhet <- reviews_final$sentiment_syuzhet

par(mar = c(4, 4, 3, 1) + 0.1)

plot(
  density(syuzhet, na.rm = TRUE),
  main = "syuzhet Bing Sentiment (Raw)",
  xlab = "Raw sentiment score",
  ylab = "Density",
  lwd = 2,
  col = "darkgray"
)



# =================================================
# Figure 4: syuzhet vs LLM
# =================================================
# Overlays the syuzhet density (dashed grey) and the LLM density (solid black)
# on a fixed [-1, 1] x axis.
# NOTE(review): the previous figure labels these same syuzhet values as "raw"
# scores, while this one labels the axis "-1 to +1" - confirm the scores are
# actually on that scale, otherwise xlim cuts off part of the curve.

par(mar = c(5, 5, 4, 2) + 0.1)

plot(
  density(syuzhet, na.rm = TRUE),
  ylim = c(0, 4),
  col = "darkgray",
  lwd = 2,
  lty = 2,
  xlim = c(-1, 1),
  main = "syuzhet vs LLM Sentiment",
  xlab = "Sentiment score (-1 to +1)",
  ylab = "Density"
)

# density() stops with "'x' contains missing values" unless NAs are removed;
# drop them here exactly as is done for the syuzhet curve above.
lines(
  density(llm_sentiment, na.rm = TRUE),
  col = "black",
  lwd = 2
)

legend(
  "topleft",
  legend = c("syuzhet", "LLM (DeepSeek)"),
  col = c("darkgray", "black"),
  lwd = 2,
  lty = c(2, 1),
  bty = "n"
)

# =================================================
# Figure 5: NRC vs LLM
# =================================================

# Take as many NRC scores (from the end of the column) as there are LLM
# scores, so both vectors have the same length.
nrc_sentiment <- tail(
  reviews_final$sentiment_nrc,
  length(llm_sentiment)
)

# Compare only reviews for which BOTH scores are finite (drops NA/NaN/Inf).
keep <- is.finite(nrc_sentiment) & is.finite(llm_sentiment)

# Optional PDF export: enable pdf() here together with the matching dev.off()
# after the legend. NOTE(review): `output_path` is not defined anywhere in
# this script and has to be set before the export is enabled.
#pdf(file.path(output_path, "nrc_vs_llm_density.pdf"),
#    width = 7, height = 5)


par(mar = c(5, 5, 4, 2) + 0.1)

# NRC scores: dashed grey curve
plot(
  density(nrc_sentiment[keep]),
  col = "darkgray",
  lwd = 2,
  lty = 2,
  xlim = c(-1, 1),
  ylim = c(0, 4),
  main = "NRC Sentiment vs LLM Sentiment",
  xlab = "Sentiment score",
  ylab = "Density"
)

# LLM scores: solid black curve on the same axes
lines(
  density(llm_sentiment[keep]),
  col = "black",
  lwd = 2,
  lty = 1
)

legend(
  "topleft",
  legend = c("NRC sentiment", "LLM (DeepSeek)"),
  col = c("darkgray", "black"),
  lwd = 2,
  lty = c(2, 1),
  bty = "n"
)

# dev.off() belongs to the pdf() call above and stays disabled with it.
# Calling it without an open file device would close the on-screen device
# and discard the figure that was just drawn.
#dev.off()
Loading