# Excerpt of "02_code/R/Script 2 - Sentiment analysis.R", reconstructed from a
# collapsed diff view (+64 -4). Only the three hunks shown in that view are
# reproduced; the script lines between the hunks are not visible here.

# ---- hunk @@ -9,6 +9,9 @@ : package installation ---------------------------
install.packages("ggplot2")
# Was "dyplr": no such package exists, so dplyr was never installed.
install.packages("dplyr")
install.packages("lpSolve")
install.packages("irr")
# "stats" ships with base R and cannot be installed from CRAN, so the former
# install.packages("stats") call is dropped.
install.packages("lubridate")

# Loading of packages

# ---- hunk @@ -20,12 +23,12 @@ : package loading and data -------------------
library(ggplot2)
library(dplyr)
library(lpSolve)
library(irr)
library(stats)
library(lubridate)

# Loading necessary datasets
# NOTE(review): the diff view does not show which of these load() lines were
# removed and which were added; the ".RData.csv" and "rev.summary.RData" lines
# may be the old and new version of the same statement. Confirm against the
# real file, and prefer project-relative paths over absolute ones.
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")
load("D:/Hohenheim/SEM 3/ADS/introads_ass1_team18-main/01_data/raw/revsummary.RData.csv")
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData")
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/reviews.clean.RData")

# ---- hunk @@ -233,3 +236,60 @@ ------------------------------------------------
# (hunk context, truncated in the diff view:
#  cat("A review with the highest helpfulness is", ...sent.labe...)
# all types of sentiment analysis and it's sentiment is being neutral in all.

# Q3 ---------------------------------------------------------------------------
# Correlation between sentiment scores and overall review scores

# Average sentiment score of the reviews, based on the manual sentiment analysis
avg.sent.score <- mean(rev.sentiment$sent.score, na.rm = TRUE)
print(avg.sent.score)

# Review score taken from revsummary (downloaded in the 1st assignment)
overall.review.score <- revsummary$review_score

# cor() needs two paired vectors of equal length. The earlier call passed the
# single averaged value, which either stops with "incompatible dimensions" or
# yields NA. Correlate the per-review sentiment scores instead, and fall back
# to NA with a warning when the two vectors cannot be paired.
# NOTE(review): assumes rev.sentiment and revsummary describe the same reviews
# in the same order - confirm.
sent.scores <- rev.sentiment$sent.score
if (length(sent.scores) > 1 &&
      length(sent.scores) == length(overall.review.score)) {
  correlation <- cor(sent.scores, overall.review.score,
                     use = "pairwise.complete.obs")
} else {
  warning("Sentiment scores and review scores are not paired vectors; ",
          "correlation is undefined.", call. = FALSE)
  correlation <- NA_real_
}

# Printing the correlation coefficient
print(paste("Correlation between average sentiment and overall score:",
            correlation))

# Q4 Sentiment variation -------------------------------------------------------

# Copy the review date and the sentiment score from rev.merged into
# sent.compare.
# NOTE(review): assumes both data frames hold the same rows in the same
# order - confirm.
sent.compare$date <- rev.merged$timestamp_created
sent.compare$sent.score <- rev.merged$sent.score

# Changing class of date column
# NOTE(review): if timestamp_created is a numeric Unix timestamp it must be
# converted via as.POSIXct(x, origin = "1970-01-01") first - verify.
sent.compare$date <- as.Date(sent.compare$date)

# Aggregating total sentiment scores by date; na.rm keeps a single missing
# score from turning a whole day's total into NA.
daily.totals <- tapply(sent.compare$sent.score, sent.compare$date, sum,
                       na.rm = TRUE)
total.score.date <- data.frame(
  date = as.Date(names(daily.totals)),
  total_sentiment = as.numeric(daily.totals)
)

# Plotting the sentiment score over time. The data frame built above is used
# directly: rebuilding it from names(total.score.date) would read the column
# names ("date", "total_sentiment") instead of the dates and fail.
sentiment.plot <- ggplot(data = total.score.date) +
  geom_line(aes(x = date, y = total_sentiment), color = "blue") +
  labs(title = "Sentiment Variation Over Time",
       x = "Date",
       y = "Total Sentiment Score") +
  theme_minimal()
print(sentiment.plot)

# Pass the plot explicitly instead of relying on the last plot drawn.
ggsave(filename = "C:/Users/hetvi/OneDrive/Desktop/Sentiment variation.jpg",
       plot = sentiment.plot,
       width = 15, height = 15, units = "cm", dpi = 1600)

# Finding the date with the highest total sentiment score
high.index <- which.max(total.score.date$total_sentiment)

# Getting the corresponding row (date and total)
high.count <- total.score.date[high.index, ]
print(high.count)

# Printing a result
cat("A highest sentiment score", high.count$total_sentiment,
    "was recorded on", format(high.count$date, "%Y-%m-%d"))
# Excerpt of "02_code/R/Script 2 - Sentiment analysis.R", reconstructed from a
# collapsed diff view (+64 -4). Only the three hunks shown in that view are
# reproduced; the script lines between the hunks are not visible here.

# ---- hunk @@ -9,6 +9,9 @@ : package installation ---------------------------
install.packages("ggplot2")
# Was "dyplr": no such package exists, so dplyr was never installed.
install.packages("dplyr")
install.packages("lpSolve")
install.packages("irr")
# "stats" ships with base R and cannot be installed from CRAN, so the former
# install.packages("stats") call is dropped.
install.packages("lubridate")

# Loading of packages

# ---- hunk @@ -20,12 +23,12 @@ : package loading and data -------------------
library(ggplot2)
library(dplyr)
library(lpSolve)
library(irr)
library(stats)
library(lubridate)

# Loading necessary datasets
# NOTE(review): the diff view does not show which of these load() lines were
# removed and which were added; the ".RData.csv" and "rev.summary.RData" lines
# may be the old and new version of the same statement. Confirm against the
# real file, and prefer project-relative paths over absolute ones.
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")
load("D:/Hohenheim/SEM 3/ADS/introads_ass1_team18-main/01_data/raw/revsummary.RData.csv")
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/rev.summary.RData")
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/reviews.clean.RData")

# ---- hunk @@ -233,3 +236,60 @@ ------------------------------------------------
# (hunk context, truncated in the diff view:
#  cat("A review with the highest helpfulness is", ...sent.labe...)
# all types of sentiment analysis and it's sentiment is being neutral in all.

# Q3 ---------------------------------------------------------------------------
# Correlation between sentiment scores and overall review scores

# Average sentiment score of the reviews, based on the manual sentiment analysis
avg.sent.score <- mean(rev.sentiment$sent.score, na.rm = TRUE)
print(avg.sent.score)

# Review score taken from revsummary (downloaded in the 1st assignment)
overall.review.score <- revsummary$review_score

# cor() needs two paired vectors of equal length. The earlier call passed the
# single averaged value, which either stops with "incompatible dimensions" or
# yields NA. Correlate the per-review sentiment scores instead, and fall back
# to NA with a warning when the two vectors cannot be paired.
# NOTE(review): assumes rev.sentiment and revsummary describe the same reviews
# in the same order - confirm.
sent.scores <- rev.sentiment$sent.score
if (length(sent.scores) > 1 &&
      length(sent.scores) == length(overall.review.score)) {
  correlation <- cor(sent.scores, overall.review.score,
                     use = "pairwise.complete.obs")
} else {
  warning("Sentiment scores and review scores are not paired vectors; ",
          "correlation is undefined.", call. = FALSE)
  correlation <- NA_real_
}

# Printing the correlation coefficient
print(paste("Correlation between average sentiment and overall score:",
            correlation))

# Q4 Sentiment variation -------------------------------------------------------

# Copy the review date and the sentiment score from rev.merged into
# sent.compare.
# NOTE(review): assumes both data frames hold the same rows in the same
# order - confirm.
sent.compare$date <- rev.merged$timestamp_created
sent.compare$sent.score <- rev.merged$sent.score

# Changing class of date column
# NOTE(review): if timestamp_created is a numeric Unix timestamp it must be
# converted via as.POSIXct(x, origin = "1970-01-01") first - verify.
sent.compare$date <- as.Date(sent.compare$date)

# Aggregating total sentiment scores by date; na.rm keeps a single missing
# score from turning a whole day's total into NA.
daily.totals <- tapply(sent.compare$sent.score, sent.compare$date, sum,
                       na.rm = TRUE)
total.score.date <- data.frame(
  date = as.Date(names(daily.totals)),
  total_sentiment = as.numeric(daily.totals)
)

# Plotting the sentiment score over time. The data frame built above is used
# directly: rebuilding it from names(total.score.date) would read the column
# names ("date", "total_sentiment") instead of the dates and fail.
sentiment.plot <- ggplot(data = total.score.date) +
  geom_line(aes(x = date, y = total_sentiment), color = "blue") +
  labs(title = "Sentiment Variation Over Time",
       x = "Date",
       y = "Total Sentiment Score") +
  theme_minimal()
print(sentiment.plot)

# Pass the plot explicitly instead of relying on the last plot drawn.
ggsave(filename = "C:/Users/hetvi/OneDrive/Desktop/Sentiment variation.jpg",
       plot = sentiment.plot,
       width = 15, height = 15, units = "cm", dpi = 1600)

# Finding the date with the highest total sentiment score
high.index <- which.max(total.score.date$total_sentiment)

# Getting the corresponding row (date and total)
high.count <- total.score.date[high.index, ]
print(high.count)

# Printing a result
cat("A highest sentiment score", high.count$total_sentiment,
    "was recorded on", format(high.count$date, "%Y-%m-%d"))