Some changes have been made for better understanding. (f0c17b80) · Commits · Hetvi Ariwala / IntroADS_Ass2_Team18

02_code/R/Script 2 - Sentiment analysis.R

+91 −71

Original line number	Diff line number	Diff line
		@@ -7,10 +7,8 @@ install.packages("tm")
		# One-time installation of the packages that the library() calls below load.
		install.packages("RColorBrewer")
		install.packages("ggplot2")
		# Package name corrected from "dyplr": the script loads it as library(dplyr).
		install.packages("dplyr")
		install.packages("lpSolve")
		install.packages("irr")
		# NOTE(review): stats ships with base R, so this call presumably only
		# produces a warning - confirm and drop it if so.
		install.packages("stats")
		install.packages("lubridate")
		install.packages("xtable")



		@@ -21,24 +19,23 @@ library(tm)
		library(RColorBrewer)
		library(ggplot2)
		library(dplyr)
		library(lpSolve)
		library(irr)
		library(stats)
		library(lubridate)
		library(xtable)

		#loading necessary dataset
		load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")
		load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData")
		load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/reviews.clean.RData")
		load("~/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")
		load("~/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData")
		load("~/ADS/introads_ass2_team18/01_data/reviews.clean.RData")


		# Only keeping necessary columns for analysis
		rev.sentiment <- data.frame(id= reviews.clean$recommendationid, reviews = reviews.clean$reviews)
		rev.sentiment <- data.frame(id= reviews.clean$recommendationid,
		reviews = reviews.clean$reviews)

		# Manual sentiment analysis-----------------------------------------------------

		# Loading the necessary word lists (Hu and Liu opinion lexicon, taken from Kaggle)
		setwd("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/00_docs")
		setwd("~/ADS/introads_ass2_team18/00_docs")

		positive.words <- readLines("positive-words.txt")
		negative.words <- readLines("negative-words.txt")
		@@ -75,38 +72,40 @@ sent.analysis <- function(review) {
		}

		# Apply sentiment analysis to all reviews in the dataframe
		rev.sentiment$sent.score <- sapply(rev.sentiment$reviews, sent.analysis)
		rev.sentiment$score.manual <- sapply(rev.sentiment$reviews, sent.analysis)

		# Labeling the sentiment score with sentiment
		rev.sentiment$sent.label <- ifelse(rev.sentiment$sent.score == 1, "Positive",
		ifelse(rev.sentiment$sent.score == -1, "Negative", "Neutral"))
		rev.sentiment$label.manual <- ifelse(rev.sentiment$score.manual == 1, "Positive",
		ifelse(rev.sentiment$score.manual == -1, "Negative", "Neutral"))

		# Counting the total number of reviews per sentiment label:
		positive.counts <- sum(rev.sentiment$sent.label == "Positive")
		positive.counts <- sum(rev.sentiment$label.manual == "Positive")
		cat("The total number of positive reviews are", positive.counts)

		negative.counts <- sum(rev.sentiment$sent.label == "Negative")
		negative.counts <- sum(rev.sentiment$label.manual == "Negative")
		cat("The total number of negative reviews are", negative.counts)

		neutral.counts <- sum(rev.sentiment$sent.label == "Neutral")
		neutral.counts <- sum(rev.sentiment$label.manual == "Neutral")
		cat("The total number of neutral reviews are", neutral.counts)

		# Syuzhet sentiment analysis using bing dictionary------------------------------

		# Sentiment analysis using the Bing dictionary
		rev.sentiment$score.syuzhet <- get_sentiment(rev.sentiment$reviews, method = "bing")
		rev.sentiment$score.bing <- get_sentiment(rev.sentiment$reviews, method = "bing")

		# Labeling the sentiment score with sentiment
		rev.sentiment$label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive",
		ifelse(rev.sentiment$score.syuzhet <= -1, "Negative", "Neutral"))
		rev.sentiment$label.bing <- ifelse(rev.sentiment$score.bing >= 1, "Positive",
		ifelse(rev.sentiment$score.bing <= -1,
		"Negative", "Neutral"))

		# Comparing the manual sentiment scores with the syuzhet (bing) scores and checking whether they are identical
		rev.sentiment$identical.score <- ifelse(rev.sentiment$sent.score == rev.sentiment$score.syuzhet, 1 , 0)
		rev.sentiment$identical.score <- ifelse(rev.sentiment$score.manual ==
		rev.sentiment$score.bing, 1 , 0)

		# Calculate the percentage of identical scores
		# Calculating the percentage of identical scores
		percentage.identical <- mean(rev.sentiment$identical.score) * 100

		# Display the result
		# Displaying the result
		cat("Percentage of Identical Score:", round(percentage.identical,2), "%\n")

		# Sentiment analysis by using other two dictionaries from syuzhet package--------
		@@ -116,18 +115,22 @@ rev.sentiment$score.afinn <- get_sentiment(rev.sentiment$reviews, method = "afin

		# Labeling sentiment score with sentiment
		rev.sentiment$label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive",
		ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral"))
		ifelse(rev.sentiment$score.afinn <= -1,
		"Negative", "Neutral"))

		# Sentiment score using nrc dictionary
		rev.sentiment$score.nrc <- get_nrc_sentiment(rev.sentiment$reviews)

		# Calculating the correlation between the results of the dictionaries
		correlation_matrix <- cor(rev.sentiment[c("sent.score", "score.syuzhet", "score.afinn")])
		correlation_matrix <- cor(rev.sentiment[c("score.manual", "score.bing", "score.afinn")])

		# Print the correlation matrix
		print(correlation_matrix)
		# Creating LaTeX-formatted table
		latex_table <- xtable(correlation_matrix, caption = "Correlation Matrix")

		write.csv(rev.sentiment,file = "C:/Users/hetvi/OneDrive/Desktop/rev.sentiment.csv")
		# Printing the LaTeX code
		print(latex_table, include.rownames = TRUE)

		save(rev.sentiment,file = "~/ADS/introads_ass2_team18/01_data/rev.sentiment.RData")
		# Emotion analysis---------------------------------------------------

		# Extracting the NRC emotion scores for each term
		@@ -143,7 +146,8 @@ nrc.scores <- data.frame(emotion = names(nrc.scores), score = nrc.scores)
		nrc.scores <- nrc.scores[order(-nrc.scores$score),]

		# Defining a custom color palette with light colors using RColorBrewer package
		custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5", "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd")
		custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5",
		"#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd")

		# Reordering the factor levels of emotion based on scores
		nrc.scores$emotion <- factor(nrc.scores$emotion, levels = nrc.scores$emotion)
		@@ -153,8 +157,7 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) +
		geom_bar(stat = "identity", color = "black", width = 0.7) +
		geom_text(aes(label = round(score, 2)), vjust = 1.5, color = "black", size = 3) +
		scale_fill_manual(values = custom_palette) +
		labs(title = "Emotion Scores",
		x = "Emotions",
		labs(x = "Emotions",
		y = "Scores") +
		theme(legend.position = "bottom",
		legend.box.margin = margin(3, 3, 3, 3),
		@@ -164,25 +167,35 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) +
		axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"), # Bolds the x-axis text
		plot.margin = margin(25, 25, 25, 25), # right margin for legend
		legend.key.size = unit(0.5, "cm")) # size adjust for legend-key
		ggsave(file= "C:/Users/akluj/OneDrive/Desktop/Emotions.jpg",
		width=15, height=15, units = "cm", dpi=1600)
		ggsave(file = "~/ADS/introads_ass2_team18/03_report/graphs/Emotions.jpg", width = 15,
		height = 15, units = "cm", dpi = 1600)


		# Calculating weightage of emotions
		nrc.scores <- nrc.scores %>%
		mutate(weightage = score / sum(score) *100)

		cat("An emotion with highest weightage is positive with", round(nrc.scores$weightage[1],2),"%\n")
		cat("An emotion with highest weightage is positive with",
		round(nrc.scores$weightage[1],2),"%\n")

		cat("An emotion with lowest weightage is disgust with", round(nrc.scores$weightage[10],2),"%\n")
		cat("An emotion with lowest weightage is disgust with",
		round(nrc.scores$weightage[10],2),"%\n")

		# Q1 Checking the consistency of sentiment with positive and negative voted_up------------------------------------------------

		rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id", by.y = "recommendationid", all.x = TRUE)
		#Combined datasets to ensure a comprehensive analysis, mitigating the risk of overlooking
		#or mismatching reviews. Merged two dataframes using a shared identifier, the recommendation ID,
		#for a more cohesive and accurate dataset.
		rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id",
		by.y = "recommendationid", all.x = TRUE)

		# Another dataframe specifically for checking sentiment consistency
		# Due to the length of the 'rev.merged' dataframe, a new dataframe was derived to
		#facilitate efficient analysis. This new dataframe selectively extracts essential columns
		#and information, streamlining the analysis process.
		sent.compare <- data.frame(id = rev.merged$id, reviews = rev.merged$reviews,
		sent.label=rev.merged$sent.label, voted_up=rev.merged$voted_up)
		label.manual=rev.merged$label.manual, voted_up=rev.merged$voted_up)

		# The voted_up column holds the values TRUE and FALSE, so they are labelled as positive and negative respectively
		sent.compare <- sent.compare %>%
		mutate(voted_up_mapped = case_when(
		voted_up==TRUE ~ "Positive",
		@@ -193,37 +206,55 @@ sent.compare <- sent.compare %>%
		sent.compare$voted_up <- as.factor(sent.compare$voted_up)

		# Creating a confusion matrix
		conf.matrix <- table(sent.compare$sent.label, sent.compare$voted_up)
		print(conf.matrix)
		conf.matrix <- table(sent.compare$label.manual, sent.compare$voted_up)

		# Printing the LaTeX code for the table
		print(xtable(conf.matrix, caption = "Confusion Matrix for Sentiment Analysis"),
		caption.placement = "top",
		include.rownames = TRUE,
		include.colnames = TRUE,
		sanitize.text.function = identity)

		# Calculating a percentage agreement
		# total.obs is the number of reviews counted in the cells of conf.matrix.
		total.obs <- sum(conf.matrix)
		cat("The total number of observation are", total.obs, "\n")

		# label.manual can take three values (Positive / Negative / Neutral) while
		# voted_up has two, so conf.matrix is not square and diag() would add the
		# [1, 1] and [2, 2] cells even though their row and column names do not
		# correspond. Agreement is therefore counted per review: a review agrees
		# when its manual label equals the label mapped from voted_up. Neutral
		# labels never match and count as disagreement.
		# NOTE(review): assumes voted_up_mapped holds "Positive"/"Negative" as
		# built by the case_when() earlier in the script - confirm the FALSE branch.
		is.agreement <- as.character(sent.compare$label.manual) ==
		as.character(sent.compare$voted_up_mapped)
		correct.agreement <- sum(is.agreement, na.rm = TRUE)
		# The message is passed as separate arguments so that no line break or
		# indentation from the source file ends up inside the printed text.
		cat("The number of reviews which have relevance of manual sentiment score and allocation",
		"of positive and negative is", correct.agreement, "\n")
		percentage.agreement <- correct.agreement / total.obs * 100


		# Printing result
		cat("Percentage Agreement:", percentage.agreement, "%\n")
		cat("A Percentage of the relevance between manual sentiment score and
		allocation of positive and negative is", round(percentage.agreement,2), "%\n")

		# Q2
		# Q2 Helpfulness of sentiment scores--------------------------------------------

		# Creating a table of counts for each sentiment label

		# Adding another column of votes_up to check on the helpfulness of the review
		sent.compare$votes_up <- rev.merged$votes_up
		sent.count <- table(sent.compare$sent.label)
		sent.count <- table(sent.compare$label.manual)

		# Displaying the counts
		print(sent.count)

		# Creating Latex table
		sent.count <- xtable(sent.count, caption = "Distribution of Sentiments")

		# Calculating an average helpfulness for each sentiment label
		avg.sent <- tapply(
		sent.compare$votes_up,
		sent.compare$sent.label,
		sent.compare$label.manual,
		mean,
		na.rm = TRUE
		)

		# Converting avg.sent to a data frame
		avg.sent <- data.frame(Sentiment = names(avg.sent), Average_Helpful_Votes = avg.sent)
		# Creating LaTeX table
		avg.sent <- xtable(avg.sent, caption = "Average Helpful Votes by Sentiment")
		# Print results
		print(avg.sent)

		@@ -238,54 +269,42 @@ cat("A review with the highest helpfulness is", max.helpfulness.review$sent.labe
		"sentiment and its' helpfulness rank is", max.helpfulness.review$votes_up,
		"which is at",max.help.index)

		# I cross-checked this review in the rev.merged dataframe and found that its
		# sentiment label is neutral under all of the sentiment analysis methods.

		# Q3

		# Calculating the correlation between average sentiment scores and overall review scores
		# Q3 Checking the resemblance between the overall review sentiment and the sentiment score

		# An average sentiment score of reviews based of manual sentiment analysis
		avg.sent.score <- mean(rev.sentiment$sent.score, na.rm = TRUE)
		print(avg.sent.score)
		avg.sent.score <- mean(rev.sentiment$score.manual, na.rm = TRUE)
		cat("An average sentiment score of overall review is",avg.sent.score)

		# The review score from the revsummary file, which was downloaded in the 1st assignment
		overall.review.score <- revsummary$review_score
		print(overall.review.score)

		# Calculating correlation between an average sentiment score and overall review score
		# correlation <- cor(avg.sent.score, overall.review.score)

		# Printing the correlation coefficient
		# print(paste("Correlation between average sentiment and overall score:", correlation))
		cat("The overall review score is", overall.review.score)

		# Q4 Sentiment variation -------------------------------------------------------

		# adding column of date from rev.merged dataframe into sent.compare dataframe
		sent.compare$date <- rev.merged$timestamp_created

		# adding a column of sent score from rev.merged into sent.compare
		sent.compare$sent.score <- rev.merged$sent.score

		# Changing class of date column
		sent.compare$date <- as.Date(sent.compare$date)

		# adding a column of sent score from rev.merged into sent.compare
		sent.compare$score.manual <- rev.merged$score.manual

		# Aggregating total sentiment scores by date
		total.score.date <- tapply(sent.compare$sent.score, sent.compare$date, sum)
		total.score.date <- tapply(sent.compare$score.manual, sent.compare$date, sum)

		# Converting into dataframe structure
		total.score.date <- data.frame(date = as.Date(names(total.score.date)),
		total_sentiment = as.numeric(total.score.date))


		# Plotting of sentiment score over a period of time
		ggplot(data = data.frame(date = as.Date(names(total.score.date)),
		sentiment.score = as.numeric(total.score.date))) +
		geom_line(aes(x = date, y = sentiment.score), color = "blue") +
		labs(title = "Sentiment Variation Over Time",
		x = "Date",
		ggplot(data = total.score.date) +
		geom_line(aes(x = date, y = total_sentiment), color = "blue") +
		labs(x = "Date",
		y = "Total Sentiment Score") +
		theme_minimal()
		ggsave(file= "C:/Users/hetvi/OneDrive/Desktop/Sentiment variation.jpg",
		ggsave(file= "~/ADS/introads_ass2_team18/03_report/graphs/Sentiment variation.jpg",
		width=15, height=15, units = "cm", dpi=1600)

		# Finding the index of the date with the highest total sentiment score
		@@ -296,4 +315,5 @@ high.count <- total.score.date[high.index,]
		print(high.count)

		# Printing a result
		cat("A highest sentiment score of", high.count$total_sentiment, "was recorded on", format(high.count$date, "%Y-%m-%d"))
		cat("A highest sentiment score of", high.count$total_sentiment,
		"was recorded on", format(high.count$date, "%Y-%m-%d"))