Loading 02_code/R/Script 2 - Sentiment analysis.R +91 −71 Original line number Diff line number Diff line Loading @@ -7,10 +7,8 @@ install.packages("tm") install.packages("RColorBrewer") install.packages("ggplot2") install.packages("dyplr") install.packages("lpSolve") install.packages("irr") install.packages("stats") install.packages("lubridate") install.packages("xtable") Loading @@ -21,24 +19,23 @@ library(tm) library(RColorBrewer) library(ggplot2) library(dplyr) library(lpSolve) library(irr) library(stats) library(lubridate) library(xtable) #loading necessary dataset load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData") load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/reviews.clean.RData") load("~/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") load("~/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData") load("~/ADS/introads_ass2_team18/01_data/reviews.clean.RData") # Only keeping necessary columns for analysis rev.sentiment <- data.frame(id= reviews.clean$recommendationid, reviews = reviews.clean$reviews) rev.sentiment <- data.frame(id= reviews.clean$recommendationid, reviews = reviews.clean$reviews) # Manual sentiment analysis----------------------------------------------------- # Loading of necessary words lists taken from Kaggel by Hu and Bing Liu setwd("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/00_docs") setwd("~/ADS/introads_ass2_team18/00_docs") positive.words <- readLines("positive-words.txt") negative.words <- readLines("negative-words.txt") Loading Loading @@ -75,38 +72,40 @@ sent.analysis <- function(review) { } # Apply sentiment analysis to all reviews in the dataframe rev.sentiment$sent.score <- sapply(rev.sentiment$reviews, sent.analysis) rev.sentiment$score.manual <- sapply(rev.sentiment$reviews, sent.analysis) # Labeling the sentiment score with sentiment rev.sentiment$sent.label <- ifelse(rev.sentiment$sent.score == 1, "Positive", ifelse(rev.sentiment$sent.score == -1, "Negative", "Neutral")) rev.sentiment$label.manual <- ifelse(rev.sentiment$score.manual == 1, "Positive", ifelse(rev.sentiment$score.manual == -1, "Negative", "Neutral")) # Counting the total number reviews: positive.counts <- sum(rev.sentiment$sent.label == "Positive") positive.counts <- sum(rev.sentiment$label.manual == "Positive") cat("The total number of positive reviews are", positive.counts) negative.counts <- sum(rev.sentiment$sent.label == "Negative") negative.counts <- sum(rev.sentiment$label.manual == "Negative") cat("The total number of negative reviews are", negative.counts) neutral.counts <- sum(rev.sentiment$sent.label == "Neutral") neutral.counts <- sum(rev.sentiment$label.manual == "Neutral") cat("The total number of neutral reviews are", neutral.counts) # Syuzhet sentiment analysis using bing dictionary------------------------------ # Sentiment analysis using the Bing dictionary rev.sentiment$score.syuzhet <- get_sentiment(rev.sentiment$reviews, method = "bing") rev.sentiment$score.bing <- get_sentiment(rev.sentiment$reviews, method = "bing") # Labeling the sentiment score with sentiment rev.sentiment$label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive", ifelse(rev.sentiment$score.syuzhet <= -1, "Negative", "Neutral")) rev.sentiment$label.bing <- ifelse(rev.sentiment$score.bing >= 1, "Positive", ifelse(rev.sentiment$score.bing <= -1, "Negative", "Neutral")) # Sentiment scores by manually and with syuzhet package and checking if its identical or not rev.sentiment$identical.score <- ifelse(rev.sentiment$sent.score == rev.sentiment$score.syuzhet, 1 , 0) rev.sentiment$identical.score <- ifelse(rev.sentiment$score.manual == rev.sentiment$score.bing, 1 , 0) # Calculate the percentage of identical scores # Calculating the percentage of identical scores percentage.identical <- mean(rev.sentiment$identical.score) * 100 # Display the result # Displaying the result cat("Percentage of Identical Score:", round(percentage.identical,2), "%\n") # Sentiment analysis by using other two dictionaries from syuzhet package-------- Loading @@ -116,18 +115,22 @@ rev.sentiment$score.afinn <- get_sentiment(rev.sentiment$reviews, method = "afin # Labeling sentiment score with sentiment rev.sentiment$label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive", ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral")) ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral")) # Sentiment score using nrc dictionary rev.sentiment$score.nrc <- get_nrc_sentiment(rev.sentiment$reviews) # calculating coorelation between the results of dictionaries correlation_matrix <- cor(rev.sentiment[c("sent.score", "score.syuzhet", "score.afinn")]) correlation_matrix <- cor(rev.sentiment[c("score.manual", "score.bing", "score.afinn")]) # Print the correlation matrix print(correlation_matrix) # Creating LaTeX-formatted table latex_table <- xtable(correlation_matrix, caption = "Correlation Matrix") write.csv(rev.sentiment,file = "C:/Users/hetvi/OneDrive/Desktop/rev.sentiment.csv") # Printing the LaTeX code print(latex_table, include.rownames = TRUE) save(rev.sentiment,file = "~/ADS/introads_ass2_team18/01_data/rev.sentiment.RData") # Emotion analysis--------------------------------------------------- # Extracting the NRC emotion scores for each term Loading @@ -143,7 +146,8 @@ nrc.scores <- data.frame(emotion = names(nrc.scores), score = nrc.scores) nrc.scores <- nrc.scores[order(-nrc.scores$score),] # Defining a custom color palette with light colors using RColorBrewer package custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5", "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd") custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5", "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd") # Reordering the factor levels of emotion based on scores nrc.scores$emotion <- factor(nrc.scores$emotion, levels = nrc.scores$emotion) Loading @@ -153,8 +157,7 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) + geom_bar(stat = "identity", color = "black", width = 0.7) + geom_text(aes(label = round(score, 2)), vjust = 1.5, color = "black", size = 3) + scale_fill_manual(values = custom_palette) + labs(title = "Emotion Scores", x = "Emotions", labs(x = "Emotions", y = "Scores") + theme(legend.position = "bottom", legend.box.margin = margin(3, 3, 3, 3), Loading @@ -164,25 +167,35 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) + axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"), # Bolds the x-axis text plot.margin = margin(25, 25, 25, 25), # right margin for legend legend.key.size = unit(0.5, "cm")) # size adjust for legend-key ggsave(file= "C:/Users/akluj/OneDrive/Desktop/Emotions.jpg", width=15, height=15, units = "cm", dpi=1600) ggsave(file = "~/ADS/introads_ass2_team18/03_report/graphs/Emotions.jpg", width = 15, height = 15, units = "cm", dpi = 1600) # Calculating weightage of emotions nrc.scores <- nrc.scores %>% mutate(weightage = score / sum(score) *100) cat("An emotion with highest weightage is positive with", round(nrc.scores$weightage[1],2),"%\n") cat("An emotion with highest weightage is positive with", round(nrc.scores$weightage[1],2),"%\n") cat("An emotion with lowest weightage is disgust with", round(nrc.scores$weightage[10],2),"%\n") cat("An emotion with lowest weightage is disgust with", round(nrc.scores$weightage[10],2),"%\n") # Q1 Checking the consistency of sentiment with positive and negative voted_up------------------------------------------------ rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id", by.y = "recommendationid", all.x = TRUE) #Combined datasets to ensure a comprehensive analysis, mitigating the risk of overlooking #or mismatching reviews. Merged two dataframes using a shared identifier, the recommendation ID, #for a more cohesive and accurate dataset. rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id", by.y = "recommendationid", all.x = TRUE) # ANother datafarme to particularly check on sentiment consistency # Due to the length of the 'rev.merged' dataframe, a new dataframe was derived to #facilitate efficient analysis. This new dataframe selectively extracts essential columns #and information, streamlining the analysis process. sent.compare <- data.frame(id = rev.merged$id, reviews = rev.merged$reviews, sent.label=rev.merged$sent.label, voted_up=rev.merged$voted_up) label.manual=rev.merged$label.manual, voted_up=rev.merged$voted_up) # A column,voted_up has values True and False,hence labeling the given values as poistive and negative respectively sent.compare <- sent.compare %>% mutate(voted_up_mapped = case_when( voted_up==TRUE ~ "Positive", Loading @@ -193,37 +206,55 @@ sent.compare <- sent.compare %>% sent.compare$voted_up <- as.factor(sent.compare$voted_up) # Creating a confusion matrix matrix conf.matrix <- table(sent.compare$sent.label, sent.compare$voted_up) print(conf.matrix) conf.matrix <- table(sent.compare$label.manual, sent.compare$voted_up) # Printing the LaTeX code for the table print(xtable(conf.matrix, caption = "Confusion Matrix for Sentiment Analysis"), caption.placement = "top", include.rownames = TRUE, include.colnames = TRUE, sanitize.text.function = identity) # Calculating a percentage agreement total.obs <- sum(conf.matrix) cat("The total number of observation are",total.obs) correct.agreement <- sum(diag(conf.matrix)) cat("The number of reviews which have relevance of manual sentiment score and allocation of positive and negative is",correct.agreement) percentage.agreement <- correct.agreement / total.obs * 100 # Printing result cat("Percentage Agreement:", percentage.agreement, "%\n") cat("A Percentage of the relevance between manual sentiment score and allocation of positive and negative is", round(percentage.agreement,2), "%\n") # Q2 # Q2 Helpfulness of sentiment scores-------------------------------------------- # Creating a table of counts for each sentiment label # Adding another column of votes_up to check on the helpfulness of the review sent.compare$votes_up <- rev.merged$votes_up sent.count <- table(sent.compare$sent.label) sent.count <- table(sent.compare$label.manual) # Displaying the counts print(sent.count) # Creating Latex table sent.count <- xtable(sent.count, caption = "Distribution of Sentiments") # Calculating an average helpfulness for each sentiment label avg.sent <- tapply( sent.compare$votes_up, sent.compare$sent.label, sent.compare$label.manual, mean, na.rm = TRUE ) # Converting avg.sent to a data frame avg.sent <- data.frame(Sentiment = names(avg.sent), Average_Helpful_Votes = avg.sent) # Creating LaTeX table avg.sent <- xtable(avg.sent, caption = "Average Helpful Votes by Sentiment") # Print results print(avg.sent) Loading @@ -238,54 +269,42 @@ cat("A review with the highest helpfulness is", max.helpfulness.review$sent.labe "sentiment and its' helpfulness rank is", max.helpfulness.review$votes_up, "which is at",max.help.index) # I crossed check the review in rev.merged dataframe and I found that its sentiment label with # all types of sentiment analysis and it's sentiment is being neutral in all. # Q3 # Calculating the correlation between average sentiment scores and overall review scores # Q3 Checking the resembleness of overall review sentiment and sentiment score # An average sentiment score of reviews based of manual sentiment analysis avg.sent.score <- mean(rev.sentiment$sent.score, na.rm = TRUE) print(avg.sent.score) avg.sent.score <- mean(rev.sentiment$score.manual, na.rm = TRUE) cat("An average sentiment score of overall review is",avg.sent.score) # A review score based from revsummary file, it was dowanloaded in the 1st assisgnement overall.review.score <- revsummary$review_score print(overall.review.score) # Calculating correlation between an average sentiment score and overall review score # correlation <- cor(avg.sent.score, overall.review.score) # Printing the correlation coefficient # print(paste("Correlation between average sentiment and overall score:", correlation)) cat("The overall review score is", overall.review.score) # Q4 Sentiment variation ------------------------------------------------------- # adding column of date from rev.merged dataframe into sent.compare dataframe sent.compare$date <- rev.merged$timestamp_created # adding a column of sent score from rev.merged into sent.compare sent.compare$sent.score <- rev.merged$sent.score # Changing class of date column sent.compare$date <- as.Date(sent.compare$date) # adding a column of sent score from rev.merged into sent.compare sent.compare$score.manual <- rev.merged$score.manual # Aggregating total sentiment scores by date total.score.date <- tapply(sent.compare$sent.score, sent.compare$date, sum) total.score.date <- tapply(sent.compare$score.manual, sent.compare$date, sum) # Converting into dataframe structure total.score.date <- data.frame(date = as.Date(names(total.score.date)), total_sentiment = as.numeric(total.score.date)) # Plotting of sentiment score over a period of time ggplot(data = data.frame(date = as.Date(names(total.score.date)), sentiment.score = as.numeric(total.score.date))) + geom_line(aes(x = date, y = sentiment.score), color = "blue") + labs(title = "Sentiment Variation Over Time", x = "Date", ggplot(data = total.score.date) + geom_line(aes(x = date, y = total_sentiment), color = "blue") + labs(x = "Date", y = "Total Sentiment Score") + theme_minimal() ggsave(file= "C:/Users/hetvi/OneDrive/Desktop/Sentiment variation.jpg", ggsave(file= "~/ADS/introads_ass2_team18/03_report/graphs/Sentiment variation.jpg", width=15, height=15, units = "cm", dpi=1600) # Finding the index of the review with the highest helpful votes Loading @@ -296,4 +315,5 @@ high.count <- total.score.date[high.index,] print(high.count) # Printing a result cat("A highest sentiment score of", high.count$total_sentiment, "was recorded on", format(high.count$date, "%Y-%m-%d")) cat("A highest sentiment score of", high.count$total_sentiment, "was recorded on", format(high.count$date, "%Y-%m-%d")) Loading
02_code/R/Script 2 - Sentiment analysis.R +91 −71 Original line number Diff line number Diff line Loading @@ -7,10 +7,8 @@ install.packages("tm") install.packages("RColorBrewer") install.packages("ggplot2") install.packages("dyplr") install.packages("lpSolve") install.packages("irr") install.packages("stats") install.packages("lubridate") install.packages("xtable") Loading @@ -21,24 +19,23 @@ library(tm) library(RColorBrewer) library(ggplot2) library(dplyr) library(lpSolve) library(irr) library(stats) library(lubridate) library(xtable) #loading necessary dataset load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData") load("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/01_data/reviews.clean.RData") load("~/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") load("~/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData") load("~/ADS/introads_ass2_team18/01_data/reviews.clean.RData") # Only keeping necessary columns for analysis rev.sentiment <- data.frame(id= reviews.clean$recommendationid, reviews = reviews.clean$reviews) rev.sentiment <- data.frame(id= reviews.clean$recommendationid, reviews = reviews.clean$reviews) # Manual sentiment analysis----------------------------------------------------- # Loading of necessary words lists taken from Kaggel by Hu and Bing Liu setwd("C:/Users/akluj/SEM_3/ADS/introads_ass2_team18/00_docs") setwd("~/ADS/introads_ass2_team18/00_docs") positive.words <- readLines("positive-words.txt") negative.words <- readLines("negative-words.txt") Loading Loading @@ -75,38 +72,40 @@ sent.analysis <- function(review) { } # Apply sentiment analysis to all reviews in the dataframe rev.sentiment$sent.score <- sapply(rev.sentiment$reviews, sent.analysis) rev.sentiment$score.manual <- sapply(rev.sentiment$reviews, sent.analysis) # Labeling the sentiment score with sentiment rev.sentiment$sent.label <- ifelse(rev.sentiment$sent.score == 1, "Positive", ifelse(rev.sentiment$sent.score == -1, "Negative", "Neutral")) rev.sentiment$label.manual <- ifelse(rev.sentiment$score.manual == 1, "Positive", ifelse(rev.sentiment$score.manual == -1, "Negative", "Neutral")) # Counting the total number reviews: positive.counts <- sum(rev.sentiment$sent.label == "Positive") positive.counts <- sum(rev.sentiment$label.manual == "Positive") cat("The total number of positive reviews are", positive.counts) negative.counts <- sum(rev.sentiment$sent.label == "Negative") negative.counts <- sum(rev.sentiment$label.manual == "Negative") cat("The total number of negative reviews are", negative.counts) neutral.counts <- sum(rev.sentiment$sent.label == "Neutral") neutral.counts <- sum(rev.sentiment$label.manual == "Neutral") cat("The total number of neutral reviews are", neutral.counts) # Syuzhet sentiment analysis using bing dictionary------------------------------ # Sentiment analysis using the Bing dictionary rev.sentiment$score.syuzhet <- get_sentiment(rev.sentiment$reviews, method = "bing") rev.sentiment$score.bing <- get_sentiment(rev.sentiment$reviews, method = "bing") # Labeling the sentiment score with sentiment rev.sentiment$label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive", ifelse(rev.sentiment$score.syuzhet <= -1, "Negative", "Neutral")) rev.sentiment$label.bing <- ifelse(rev.sentiment$score.bing >= 1, "Positive", ifelse(rev.sentiment$score.bing <= -1, "Negative", "Neutral")) # Sentiment scores by manually and with syuzhet package and checking if its identical or not rev.sentiment$identical.score <- ifelse(rev.sentiment$sent.score == rev.sentiment$score.syuzhet, 1 , 0) rev.sentiment$identical.score <- ifelse(rev.sentiment$score.manual == rev.sentiment$score.bing, 1 , 0) # Calculate the percentage of identical scores # Calculating the percentage of identical scores percentage.identical <- mean(rev.sentiment$identical.score) * 100 # Display the result # Displaying the result cat("Percentage of Identical Score:", round(percentage.identical,2), "%\n") # Sentiment analysis by using other two dictionaries from syuzhet package-------- Loading @@ -116,18 +115,22 @@ rev.sentiment$score.afinn <- get_sentiment(rev.sentiment$reviews, method = "afin # Labeling sentiment score with sentiment rev.sentiment$label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive", ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral")) ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral")) # Sentiment score using nrc dictionary rev.sentiment$score.nrc <- get_nrc_sentiment(rev.sentiment$reviews) # calculating coorelation between the results of dictionaries correlation_matrix <- cor(rev.sentiment[c("sent.score", "score.syuzhet", "score.afinn")]) correlation_matrix <- cor(rev.sentiment[c("score.manual", "score.bing", "score.afinn")]) # Print the correlation matrix print(correlation_matrix) # Creating LaTeX-formatted table latex_table <- xtable(correlation_matrix, caption = "Correlation Matrix") write.csv(rev.sentiment,file = "C:/Users/hetvi/OneDrive/Desktop/rev.sentiment.csv") # Printing the LaTeX code print(latex_table, include.rownames = TRUE) save(rev.sentiment,file = "~/ADS/introads_ass2_team18/01_data/rev.sentiment.RData") # Emotion analysis--------------------------------------------------- # Extracting the NRC emotion scores for each term Loading @@ -143,7 +146,8 @@ nrc.scores <- data.frame(emotion = names(nrc.scores), score = nrc.scores) nrc.scores <- nrc.scores[order(-nrc.scores$score),] # Defining a custom color palette with light colors using RColorBrewer package custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5", "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd") custom_palette <- c("#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5", "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd") # Reordering the factor levels of emotion based on scores nrc.scores$emotion <- factor(nrc.scores$emotion, levels = nrc.scores$emotion) Loading @@ -153,8 +157,7 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) + geom_bar(stat = "identity", color = "black", width = 0.7) + geom_text(aes(label = round(score, 2)), vjust = 1.5, color = "black", size = 3) + scale_fill_manual(values = custom_palette) + labs(title = "Emotion Scores", x = "Emotions", labs(x = "Emotions", y = "Scores") + theme(legend.position = "bottom", legend.box.margin = margin(3, 3, 3, 3), Loading @@ -164,25 +167,35 @@ ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) + axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"), # Bolds the x-axis text plot.margin = margin(25, 25, 25, 25), # right margin for legend legend.key.size = unit(0.5, "cm")) # size adjust for legend-key ggsave(file= "C:/Users/akluj/OneDrive/Desktop/Emotions.jpg", width=15, height=15, units = "cm", dpi=1600) ggsave(file = "~/ADS/introads_ass2_team18/03_report/graphs/Emotions.jpg", width = 15, height = 15, units = "cm", dpi = 1600) # Calculating weightage of emotions nrc.scores <- nrc.scores %>% mutate(weightage = score / sum(score) *100) cat("An emotion with highest weightage is positive with", round(nrc.scores$weightage[1],2),"%\n") cat("An emotion with highest weightage is positive with", round(nrc.scores$weightage[1],2),"%\n") cat("An emotion with lowest weightage is disgust with", round(nrc.scores$weightage[10],2),"%\n") cat("An emotion with lowest weightage is disgust with", round(nrc.scores$weightage[10],2),"%\n") # Q1 Checking the consistency of sentiment with positive and negative voted_up------------------------------------------------ rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id", by.y = "recommendationid", all.x = TRUE) #Combined datasets to ensure a comprehensive analysis, mitigating the risk of overlooking #or mismatching reviews. Merged two dataframes using a shared identifier, the recommendation ID, #for a more cohesive and accurate dataset. rev.merged <- merge(rev.sentiment, gamereviews, by.x = "id", by.y = "recommendationid", all.x = TRUE) # ANother datafarme to particularly check on sentiment consistency # Due to the length of the 'rev.merged' dataframe, a new dataframe was derived to #facilitate efficient analysis. This new dataframe selectively extracts essential columns #and information, streamlining the analysis process. sent.compare <- data.frame(id = rev.merged$id, reviews = rev.merged$reviews, sent.label=rev.merged$sent.label, voted_up=rev.merged$voted_up) label.manual=rev.merged$label.manual, voted_up=rev.merged$voted_up) # A column,voted_up has values True and False,hence labeling the given values as poistive and negative respectively sent.compare <- sent.compare %>% mutate(voted_up_mapped = case_when( voted_up==TRUE ~ "Positive", Loading @@ -193,37 +206,55 @@ sent.compare <- sent.compare %>% sent.compare$voted_up <- as.factor(sent.compare$voted_up) # Creating a confusion matrix matrix conf.matrix <- table(sent.compare$sent.label, sent.compare$voted_up) print(conf.matrix) conf.matrix <- table(sent.compare$label.manual, sent.compare$voted_up) # Printing the LaTeX code for the table print(xtable(conf.matrix, caption = "Confusion Matrix for Sentiment Analysis"), caption.placement = "top", include.rownames = TRUE, include.colnames = TRUE, sanitize.text.function = identity) # Calculating a percentage agreement total.obs <- sum(conf.matrix) cat("The total number of observation are",total.obs) correct.agreement <- sum(diag(conf.matrix)) cat("The number of reviews which have relevance of manual sentiment score and allocation of positive and negative is",correct.agreement) percentage.agreement <- correct.agreement / total.obs * 100 # Printing result cat("Percentage Agreement:", percentage.agreement, "%\n") cat("A Percentage of the relevance between manual sentiment score and allocation of positive and negative is", round(percentage.agreement,2), "%\n") # Q2 # Q2 Helpfulness of sentiment scores-------------------------------------------- # Creating a table of counts for each sentiment label # Adding another column of votes_up to check on the helpfulness of the review sent.compare$votes_up <- rev.merged$votes_up sent.count <- table(sent.compare$sent.label) sent.count <- table(sent.compare$label.manual) # Displaying the counts print(sent.count) # Creating Latex table sent.count <- xtable(sent.count, caption = "Distribution of Sentiments") # Calculating an average helpfulness for each sentiment label avg.sent <- tapply( sent.compare$votes_up, sent.compare$sent.label, sent.compare$label.manual, mean, na.rm = TRUE ) # Converting avg.sent to a data frame avg.sent <- data.frame(Sentiment = names(avg.sent), Average_Helpful_Votes = avg.sent) # Creating LaTeX table avg.sent <- xtable(avg.sent, caption = "Average Helpful Votes by Sentiment") # Print results print(avg.sent) Loading @@ -238,54 +269,42 @@ cat("A review with the highest helpfulness is", max.helpfulness.review$sent.labe "sentiment and its' helpfulness rank is", max.helpfulness.review$votes_up, "which is at",max.help.index) # I crossed check the review in rev.merged dataframe and I found that its sentiment label with # all types of sentiment analysis and it's sentiment is being neutral in all. # Q3 # Calculating the correlation between average sentiment scores and overall review scores # Q3 Checking the resembleness of overall review sentiment and sentiment score # An average sentiment score of reviews based of manual sentiment analysis avg.sent.score <- mean(rev.sentiment$sent.score, na.rm = TRUE) print(avg.sent.score) avg.sent.score <- mean(rev.sentiment$score.manual, na.rm = TRUE) cat("An average sentiment score of overall review is",avg.sent.score) # A review score based from revsummary file, it was dowanloaded in the 1st assisgnement overall.review.score <- revsummary$review_score print(overall.review.score) # Calculating correlation between an average sentiment score and overall review score # correlation <- cor(avg.sent.score, overall.review.score) # Printing the correlation coefficient # print(paste("Correlation between average sentiment and overall score:", correlation)) cat("The overall review score is", overall.review.score) # Q4 Sentiment variation ------------------------------------------------------- # adding column of date from rev.merged dataframe into sent.compare dataframe sent.compare$date <- rev.merged$timestamp_created # adding a column of sent score from rev.merged into sent.compare sent.compare$sent.score <- rev.merged$sent.score # Changing class of date column sent.compare$date <- as.Date(sent.compare$date) # adding a column of sent score from rev.merged into sent.compare sent.compare$score.manual <- rev.merged$score.manual # Aggregating total sentiment scores by date total.score.date <- tapply(sent.compare$sent.score, sent.compare$date, sum) total.score.date <- tapply(sent.compare$score.manual, sent.compare$date, sum) # Converting into dataframe structure total.score.date <- data.frame(date = as.Date(names(total.score.date)), total_sentiment = as.numeric(total.score.date)) # Plotting of sentiment score over a period of time ggplot(data = data.frame(date = as.Date(names(total.score.date)), sentiment.score = as.numeric(total.score.date))) + geom_line(aes(x = date, y = sentiment.score), color = "blue") + labs(title = "Sentiment Variation Over Time", x = "Date", ggplot(data = total.score.date) + geom_line(aes(x = date, y = total_sentiment), color = "blue") + labs(x = "Date", y = "Total Sentiment Score") + theme_minimal() ggsave(file= "C:/Users/hetvi/OneDrive/Desktop/Sentiment variation.jpg", ggsave(file= "~/ADS/introads_ass2_team18/03_report/graphs/Sentiment variation.jpg", width=15, height=15, units = "cm", dpi=1600) # Finding the index of the review with the highest helpful votes Loading @@ -296,4 +315,5 @@ high.count <- total.score.date[high.index,] print(high.count) # Printing a result cat("A highest sentiment score of", high.count$total_sentiment, "was recorded on", format(high.count$date, "%Y-%m-%d")) cat("A highest sentiment score of", high.count$total_sentiment, "was recorded on", format(high.count$date, "%Y-%m-%d"))