Commit f0c17b80 authored by Sarish Aklujkar's avatar Sarish Aklujkar
Browse files

Some changes have been made for better understanding.

parent 6f26b1a7
Loading
Loading
Loading
Loading
+91 −71
Original line number Diff line number Diff line
@@ -7,10 +7,8 @@ install.packages("tm")
# One-time installation of the packages used by this script
install.packages("RColorBrewer")
install.packages("ggplot2")
# Package name corrected from "dyplr": the script attaches it with library(dplyr)
install.packages("dplyr")
install.packages("lpSolve")
install.packages("irr")
install.packages("stats")
install.packages("lubridate")
install.packages("xtable")



@@ -21,24 +19,23 @@ library(tm)
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(lpSolve)
library(irr)
library(stats)
library(lubridate)
library(xtable)

#loading necessary dataset
load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")
load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData")
load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/reviews.clean.RData")
load("~/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")
load("~/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData")
load("~/ADS/introads_ass2_team18/01_data/reviews.clean.RData")


# Only keeping necessary columns for analysis
rev.sentiment <- data.frame(id= reviews.clean$recommendationid,  reviews = reviews.clean$reviews)
rev.sentiment <- data.frame(id= reviews.clean$recommendationid,  
                            reviews = reviews.clean$reviews)

# Manual sentiment analysis-----------------------------------------------------

# Loading the necessary word lists by Hu and Bing Liu, taken from Kaggle
setwd("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/00_docs")
setwd("~/ADS/introads_ass2_team18/00_docs")

positive.words <- readLines("positive-words.txt")
negative.words <- readLines("negative-words.txt")
@@ -75,38 +72,40 @@ sent.analysis <- function(review) {
}

# Apply sentiment analysis to all reviews in the dataframe
rev.sentiment$sent.score <- sapply(rev.sentiment$reviews, sent.analysis)
rev.sentiment$score.manual <- sapply(rev.sentiment$reviews, sent.analysis)

# Labeling the sentiment score with sentiment
rev.sentiment$sent.label <- ifelse(rev.sentiment$sent.score == 1, "Positive",
                                        ifelse(rev.sentiment$sent.score == -1, "Negative", "Neutral"))
rev.sentiment$label.manual <- ifelse(rev.sentiment$score.manual == 1, "Positive",
                                        ifelse(rev.sentiment$score.manual == -1, "Negative", "Neutral"))

# Counting the total number of reviews per sentiment label:
positive.counts <- sum(rev.sentiment$sent.label == "Positive")
positive.counts <- sum(rev.sentiment$label.manual == "Positive")
cat("The total number of positive reviews are", positive.counts)

negative.counts <- sum(rev.sentiment$sent.label == "Negative")
negative.counts <- sum(rev.sentiment$label.manual == "Negative")
cat("The total number of negative reviews are", negative.counts)

neutral.counts <- sum(rev.sentiment$sent.label == "Neutral")
neutral.counts <- sum(rev.sentiment$label.manual == "Neutral")
cat("The total number of neutral reviews are", neutral.counts)

# Syuzhet sentiment analysis using bing dictionary------------------------------

# Sentiment analysis using the Bing dictionary
rev.sentiment$score.syuzhet <- get_sentiment(rev.sentiment$reviews, method = "bing")
rev.sentiment$score.bing <- get_sentiment(rev.sentiment$reviews, method = "bing")

# Labeling the sentiment score with sentiment
rev.sentiment$label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive",
                                        ifelse(rev.sentiment$score.syuzhet <= -1, "Negative", "Neutral"))
rev.sentiment$label.bing <- ifelse(rev.sentiment$score.bing >= 1, "Positive",
                                        ifelse(rev.sentiment$score.bing <= -1, 
                                               "Negative", "Neutral"))

# Comparing the manual sentiment scores with the syuzhet (bing) scores and checking whether they are identical
rev.sentiment$identical.score <- ifelse(rev.sentiment$sent.score == rev.sentiment$score.syuzhet, 1 , 0)
rev.sentiment$identical.score <- ifelse(rev.sentiment$score.manual == 
                                          rev.sentiment$score.bing, 1 , 0)

# Calculate the percentage of identical scores
# Calculating the percentage of identical scores
percentage.identical <- mean(rev.sentiment$identical.score) * 100

# Display the result
# Displaying the result
cat("Percentage of Identical Score:", round(percentage.identical,2), "%\n")

# Sentiment analysis by using other two dictionaries from syuzhet package--------
@@ -116,18 +115,22 @@ rev.sentiment$score.afinn <- get_sentiment(rev.sentiment$reviews, method = "afin

# Labeling sentiment score with sentiment
rev.sentiment$label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive",
                                         ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral"))
                                         ifelse(rev.sentiment$score.afinn <= -1, 
                                                "Negative", "Neutral"))

# Sentiment score using nrc dictionary
rev.sentiment$score.nrc <- get_nrc_sentiment(rev.sentiment$reviews)

# Calculating the correlation between the results of the dictionaries
correlation_matrix <- cor(rev.sentiment[c("sent.score", "score.syuzhet", "score.afinn")])
correlation_matrix <- cor(rev.sentiment[c("score.manual", "score.bing", "score.afinn")])

# Print the correlation matrix
print(correlation_matrix)
# Creating LaTeX-formatted table
latex_table <- xtable(correlation_matrix, caption = "Correlation Matrix")

write.csv(rev.sentiment,file = "C:/Users/hetvi/OneDrive/Desktop/rev.sentiment.csv")
# Printing the LaTeX code
print(latex_table, include.rownames = TRUE)

save(rev.sentiment,file = "~/ADS/introads_ass2_team18/01_data/rev.sentiment.RData")
# Emotion analysis---------------------------------------------------

# Extracting the NRC emotion scores for each term
@@ -143,7 +146,8 @@ nrc.scores <- data.frame(emotion = names(nrc.scores), score = nrc.scores)
nrc.scores <- nrc.scores[order(-nrc.scores$score),]

# Defining a custom color palette with light colors using RColorBrewer package
custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5", "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd")
custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5", 
                    "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd")

# Reordering the factor levels of emotion based on scores
nrc.scores$emotion <- factor(nrc.scores$emotion, levels = nrc.scores$emotion)
@@ -153,8 +157,7 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) +
  geom_bar(stat = "identity", color = "black", width = 0.7) +
  geom_text(aes(label = round(score, 2)), vjust = 1.5, color = "black", size = 3) + 
  scale_fill_manual(values = custom_palette) +
  labs(title = "Emotion Scores",
       x = "Emotions",
  labs(x = "Emotions",
       y = "Scores") +
  theme(legend.position = "bottom", 
    legend.box.margin = margin(3, 3, 3, 3), 
@@ -164,25 +167,35 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) +
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"),  # Bolds the x-axis text
    plot.margin = margin(25, 25, 25, 25),  # right margin for legend
    legend.key.size = unit(0.5, "cm")) # size adjust for legend-key
ggsave(file= "C:/Users/akluj/OneDrive/Desktop/Emotions.jpg", 
       width=15, height=15, units = "cm", dpi=1600)
ggsave(file = "~/ADS/introads_ass2_team18/03_report/graphs/Emotions.jpg", width = 15, 
       height = 15, units = "cm", dpi = 1600)


# Calculating weightage of emotions
nrc.scores <- nrc.scores %>%
  mutate(weightage = score / sum(score) *100)

cat("An emotion with highest weightage is positive with", round(nrc.scores$weightage[1],2),"%\n")
cat("An emotion with highest weightage is positive with", 
    round(nrc.scores$weightage[1],2),"%\n")

cat("An emotion with lowest weightage is disgust with", round(nrc.scores$weightage[10],2),"%\n")
cat("An emotion with lowest weightage is disgust with", 
    round(nrc.scores$weightage[10],2),"%\n")

# Q1 Checking the consistency of sentiment with positive and negative voted_up------------------------------------------------

rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id", by.y = "recommendationid", all.x = TRUE)
#Combined datasets to ensure a comprehensive analysis, mitigating the risk of overlooking 
#or mismatching reviews. Merged two dataframes using a shared identifier, the recommendation ID, 
#for a more cohesive and accurate dataset.
rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id", 
                    by.y = "recommendationid", all.x = TRUE)

# Another dataframe to specifically check sentiment consistency
# Due to the length of the 'rev.merged' dataframe, a new dataframe was derived to 
#facilitate efficient analysis. This new dataframe selectively extracts essential columns
#and information, streamlining the analysis process.
sent.compare <- data.frame(id = rev.merged$id, reviews = rev.merged$reviews, 
                           sent.label=rev.merged$sent.label, voted_up=rev.merged$voted_up)
                           label.manual=rev.merged$label.manual, voted_up=rev.merged$voted_up)

# The voted_up column holds TRUE and FALSE values, hence labeling them as positive and negative respectively
sent.compare <- sent.compare %>%
  mutate(voted_up_mapped = case_when(
    voted_up==TRUE ~ "Positive",
@@ -193,37 +206,55 @@ sent.compare <- sent.compare %>%
sent.compare$voted_up <- as.factor(sent.compare$voted_up)

# Creating a confusion matrix
conf.matrix <- table(sent.compare$sent.label, sent.compare$voted_up)
print(conf.matrix)
conf.matrix <- table(sent.compare$label.manual, sent.compare$voted_up)

# Printing the LaTeX code for the table
print(xtable(conf.matrix, caption = "Confusion Matrix for Sentiment Analysis"), 
      caption.placement = "top", 
      include.rownames = TRUE,
      include.colnames = TRUE,
      sanitize.text.function = identity)

# Calculating the percentage agreement
total.obs <- sum(conf.matrix)
cat("The total number of observation are",total.obs)

correct.agreement <- sum(diag(conf.matrix))
cat("The number of reviews which have relevance of manual sentiment score and allocation 
    of positive and negative is",correct.agreement)
percentage.agreement <- correct.agreement / total.obs * 100


# Printing result
cat("Percentage Agreement:", percentage.agreement, "%\n")
cat("A Percentage of the relevance between manual sentiment score and 
    allocation of positive and negative is", round(percentage.agreement,2), "%\n")

# Q2 
# Q2 Helpfulness of sentiment scores--------------------------------------------

# Creating a table of counts for each sentiment label

# Adding another column of votes_up to check on the helpfulness of the review
sent.compare$votes_up <- rev.merged$votes_up
sent.count <- table(sent.compare$sent.label)
sent.count <- table(sent.compare$label.manual)

# Displaying the counts
print(sent.count)

# Creating Latex table
sent.count <- xtable(sent.count, caption = "Distribution of Sentiments")

# Calculating an average helpfulness for each sentiment label
avg.sent <- tapply(
  sent.compare$votes_up,
  sent.compare$sent.label,
  sent.compare$label.manual,
  mean,
  na.rm = TRUE
)

# Converting avg.sent to a data frame
avg.sent <- data.frame(Sentiment = names(avg.sent), Average_Helpful_Votes = avg.sent)
# Creating LaTeX table
avg.sent <- xtable(avg.sent, caption = "Average Helpful Votes by Sentiment")
# Print results
print(avg.sent)

@@ -238,54 +269,42 @@ cat("A review with the highest helpfulness is", max.helpfulness.review$sent.labe
    "sentiment and its' helpfulness rank is", max.helpfulness.review$votes_up, 
    "which is at",max.help.index)

# I cross-checked the review in the rev.merged dataframe and found that its sentiment label
# is neutral under all types of sentiment analysis.

# Q3 

# Calculating the correlation between average sentiment scores and overall review scores
# Q3 Checking the resemblance between overall review sentiment and sentiment score

# The average sentiment score of the reviews based on the manual sentiment analysis
avg.sent.score <- mean(rev.sentiment$sent.score, na.rm = TRUE)
print(avg.sent.score)
avg.sent.score <- mean(rev.sentiment$score.manual, na.rm = TRUE)
cat("An average sentiment score of overall review is",avg.sent.score)

# The review score from the revsummary file, which was downloaded in the 1st assignment
overall.review.score <- revsummary$review_score
print(overall.review.score)

# Calculating correlation between an average sentiment score and overall review score
# correlation <- cor(avg.sent.score, overall.review.score)

# Printing the correlation coefficient
# print(paste("Correlation between average sentiment and overall score:", correlation))
cat("The overall review score is", overall.review.score)

# Q4 Sentiment variation -------------------------------------------------------

# adding column of date from rev.merged dataframe into sent.compare dataframe
sent.compare$date <- rev.merged$timestamp_created

# adding a column of sent score from rev.merged into sent.compare
sent.compare$sent.score <- rev.merged$sent.score

# Changing class of date column
sent.compare$date <- as.Date(sent.compare$date)

# adding a column of sent score from rev.merged into sent.compare
sent.compare$score.manual <- rev.merged$score.manual

# Aggregating total sentiment scores by date
total.score.date <- tapply(sent.compare$sent.score, sent.compare$date, sum)
total.score.date <- tapply(sent.compare$score.manual, sent.compare$date, sum)

# Converting into dataframe structure
total.score.date <- data.frame(date = as.Date(names(total.score.date)),
                               total_sentiment = as.numeric(total.score.date))


# Plotting of sentiment score over a period of time
ggplot(data = data.frame(date = as.Date(names(total.score.date)), 
                         sentiment.score = as.numeric(total.score.date))) +
  geom_line(aes(x = date, y = sentiment.score), color = "blue") +
  labs(title = "Sentiment Variation Over Time",
       x = "Date",
ggplot(data = total.score.date) +
  geom_line(aes(x = date, y = total_sentiment), color = "blue") +
  labs(x = "Date",
       y = "Total Sentiment Score") +
  theme_minimal()
ggsave(file= "C:/Users/hetvi/OneDrive/Desktop/Sentiment variation.jpg", 
ggsave(file= "~/ADS/introads_ass2_team18/03_report/graphs/Sentiment variation.jpg", 
       width=15, height=15, units = "cm", dpi=1600)

# Finding the index of the date with the highest total sentiment score
@@ -296,4 +315,5 @@ high.count <- total.score.date[high.index,]
print(high.count)

# Printing a result
cat("A highest sentiment score of", high.count$total_sentiment, "was recorded on", format(high.count$date, "%Y-%m-%d"))
cat("A highest sentiment score of", high.count$total_sentiment, 
    "was recorded on", format(high.count$date, "%Y-%m-%d"))