Loading 02_code/R/Script 2 - Sentiment analysis.R +65 −33 Original line number Diff line number Diff line Loading @@ -5,23 +5,27 @@ install.packages("stringr") install.packages("syuzhet") install.packages("NLP") install.packages("tm") install.packages("wordcloud") install.packages("RColorBrewer") install.packages("ggplot2") install.packages("dyplr") # Loading of packages library(tidytext) library(stringr) library(syuzhet) library(NLP) library(tm) library(wordcloud) library(RColorBrewer) library(ggplot2) library(dplyr) #loading necessary dataset setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") reviews.clean <- read.csv("reviews.clean.csv") load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/reviews.clean.RData") load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") # Only keeping necessary columns for analysis rev.sentiment <- data.frame(reviews = reviews.clean$review) rev.sentiment <- data.frame(reviews = reviews.clean$reviews) # Manual sentiment analysis----------------------------------------------------- Loading @@ -31,7 +35,7 @@ setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/00_docs") positive.words <- readLines("positive-words.txt") negative.words <- readLines("negative-words.txt") # Function to perform sentiment analysis on a single review; in a code, sent=sentiment # Function to perform sentiment analysis on a single review.sent=sentiment sent.analysis <- function(review) { # Tokenize the review into words Loading @@ -56,27 +60,27 @@ sent.analysis <- function(review) { sent.score <- sent.score * -1 } # Ascribe positive (1), negative (-1), or neutral (0) sentiment # sentiment labeling positive (1), negative (-1), or neutral (0) sentiment <- ifelse(sent.score > 0, 1, ifelse(sent.score < 0, -1, 0)) return(sentiment) } # Apply sentiment analysis to all reviews in the dataframe rev.sentiment$sentiment.score <- sapply(rev.sentiment$reviews, sent.analysis) rev.sentiment$sent.score <- sapply(rev.sentiment$reviews, 
sent.analysis) # Labeling the sentiment score with sentiment rev.sentiment$sentiment.label <- ifelse(rev.sentiment$sentiment.score == 1, "Positive", ifelse(rev.sentiment$sentiment.score == -1, "Negative", "Neutral")) rev.sentiment$sent.label <- ifelse(rev.sentiment$sent.score == 1, "Positive", ifelse(rev.sentiment$sent.score == -1, "Negative", "Neutral")) # Counting the total number reviews: positive.counts <- sum(rev.sentiment$sentiment.label == "Positive") positive.counts <- sum(rev.sentiment$sent.label == "Positive") cat("The total number of positive reviews are", positive.counts) negative.counts <- sum(rev.sentiment$sentiment.label == "Negative") negative.counts <- sum(rev.sentiment$sent.label == "Negative") cat("The total number of negative reviews are", negative.counts) neutral.counts <- sum(rev.sentiment$sentiment.label == "Neutral") neutral.counts <- sum(rev.sentiment$sent.label == "Neutral") cat("The total number of neutral reviews are", neutral.counts) # Syuzhet sentiment analysis using bing dictionary------------------------------ Loading @@ -85,17 +89,17 @@ cat("The total number of neutral reviews are", neutral.counts) rev.sentiment$score.syuzhet <- get_sentiment(rev.sentiment$reviews, method = "bing") # Labeling the sentiment score with sentiment rev.sentiment$sent.label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive", rev.sentiment$label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive", ifelse(rev.sentiment$score.syuzhet <= -1, "Negative", "Neutral")) # Sentiment scores by manually and with syuzhet package and chceking if its identical or not rev.sentiment$identical.score <- ifelse(rev.sentiment$sentiment.score == score.syuzhet, 1 , 0) # Sentiment scores by manually and with syuzhet package and checking if its identical or not rev.sentiment$identical.score <- ifelse(rev.sentiment$sent.score == rev.sentiment$score.syuzhet, 1 , 0) # Calculate the percentage of identical scores percentage.identical <- 
mean(rev.sentiment$identical.score) * 100 # Display the result cat("Percentage of Identical Score:", percentage.identical, "%\n") cat("Percentage of Identical Score:", round(percentage.identical,2), "%\n") # Sentiment analysis by using other two dictionaries from syuhet package-------- Loading @@ -103,29 +107,57 @@ cat("Percentage of Identical Score:", percentage.identical, "%\n") rev.sentiment$score.afinn <- get_sentiment(rev.sentiment$reviews, method = "afinn") # Labeling sentiment score with sentiment rev.sentiment$sent.label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive", rev.sentiment$label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive", ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral")) # Sentiment score using nrc dictionary rev.sentiment$score.nrc <- get_nrc_sentiment(rev.sentiment$reviews) # Wordcloud based on emotions--------------------------------------------------- reviews <- rev.sentiment$reviews # Emotion analysis--------------------------------------------------- # Extract the NRC emotion scores for each term # Assuming 'nrc_scores' is a column containing NRC scores for each review in your rev.sentiment dataframe nrc_scores <- rev.sentiment$score.nrc # Extracting the NRC emotion scores for each term nrc.scores <- rev.sentiment$score.nrc #Sum the NRC scores for each emotion across all reviews aggregate_nrc <- colSums(nrc_scores, na.rm = TRUE) nrc.scores <- colSums(nrc.scores, na.rm = TRUE) # Convert the aggregated scores to a data frame nrc_aggregated_df <- data.frame(emotion = names(aggregate_nrc), score = aggregate_nrc) # Remove rows with NAs (if any) nrc_aggregated_df <- nrc_aggregated_df[complete.cases(nrc_aggregated_df), ] # Create the word cloud wordcloud(words = nrc_aggregated_df$emotion, freq = nrc_aggregated_df$score, min.freq = 1, scale = c(3, 0.5), colors = brewer.pal(8, "Dark2")) nrc.scores <- data.frame(emotion = names(nrc.scores), score = nrc.scores) # Order the dataframe by scores in descending order 
nrc.scores <- nrc.scores[order(-nrc.scores$score), ]

# After the sort above, row 1 holds the highest-scoring emotion and the last
# row holds the lowest-scoring one; the reporting at the end relies on this.

# Custom colour palette; colours are assigned to the emotions in the order of
# the factor levels set below (i.e. from highest to lowest score)
custom_palette <- c(
  "#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5",
  "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd"
)

# Fix the factor levels to the sorted order so the bars are drawn from the
# highest to the lowest score instead of alphabetically
nrc.scores$emotion <- factor(nrc.scores$emotion, levels = nrc.scores$emotion)

# Bar plot of the emotion scores with the value printed inside each bar
emotion.plot <- ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) +
  geom_col(color = "black", width = 0.7) +
  geom_text(aes(label = round(score, 2)),
            vjust = 1.5, color = "black", size = 3) +
  scale_fill_manual(values = custom_palette) +
  labs(title = "Emotion Scores", x = "Emotions", y = "Scores") +
  theme(
    legend.position = "bottom",
    legend.box.margin = margin(3, 3, 3, 3),
    plot.title = element_text(hjust = 0.5, face = "bold"),  # centred, bold title
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"),
    plot.margin = margin(25, 25, 25, 25),  # equal margin on all four sides
    legend.key.size = unit(0.5, "cm")      # size of the legend keys
  )

# Show the plot explicitly so it is also drawn when the script is source()d
print(emotion.plot)

# Save the plot; the plot object is passed explicitly instead of relying on
# last_plot(), and the argument is spelled out as `filename`.
# NOTE(review): the output path is machine-specific and dpi = 1600 yields a
# very large image for a 15 x 15 cm plot -- confirm both are intended.
ggsave(filename = "C:/Users/hetvi/OneDrive/Desktop/Emotions.jpg",
       plot = emotion.plot,
       width = 15, height = 15, units = "cm", dpi = 1600)

# Calculating weightage of emotions (share of each emotion in the total, in %)
nrc.scores <- nrc.scores %>%
  mutate(weightage = score / sum(score) * 100)

# Report the emotions with the highest and the lowest weightage. The names and
# positions are read from the sorted data rather than hard-coded, so the
# messages stay correct if the ranking or the number of emotions changes.
top.emotion <- nrc.scores[1, ]
bottom.emotion <- nrc.scores[nrow(nrc.scores), ]

cat("An emotion with highest weightage is",
    as.character(top.emotion$emotion), "with",
    round(top.emotion$weightage, 2), "%\n")
cat("An emotion with lowest weightage is",
    as.character(bottom.emotion$emotion), "with",
    round(bottom.emotion$weightage, 2), "%\n")

# Checking the consistency of sentiment with positive and negative voted_up
02_code/R/Script 2 - Sentiment analysis.R +65 −33 Original line number Diff line number Diff line Loading @@ -5,23 +5,27 @@ install.packages("stringr") install.packages("syuzhet") install.packages("NLP") install.packages("tm") install.packages("wordcloud") install.packages("RColorBrewer") install.packages("ggplot2") install.packages("dyplr") # Loading of packages library(tidytext) library(stringr) library(syuzhet) library(NLP) library(tm) library(wordcloud) library(RColorBrewer) library(ggplot2) library(dplyr) #loading necessary dataset setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") reviews.clean <- read.csv("reviews.clean.csv") load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/reviews.clean.RData") load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") # Only keeping necessary columns for analysis rev.sentiment <- data.frame(reviews = reviews.clean$review) rev.sentiment <- data.frame(reviews = reviews.clean$reviews) # Manual sentiment analysis----------------------------------------------------- Loading @@ -31,7 +35,7 @@ setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/00_docs") positive.words <- readLines("positive-words.txt") negative.words <- readLines("negative-words.txt") # Function to perform sentiment analysis on a single review; in a code, sent=sentiment # Function to perform sentiment analysis on a single review.sent=sentiment sent.analysis <- function(review) { # Tokenize the review into words Loading @@ -56,27 +60,27 @@ sent.analysis <- function(review) { sent.score <- sent.score * -1 } # Ascribe positive (1), negative (-1), or neutral (0) sentiment # sentiment labeling positive (1), negative (-1), or neutral (0) sentiment <- ifelse(sent.score > 0, 1, ifelse(sent.score < 0, -1, 0)) return(sentiment) } # Apply sentiment analysis to all reviews in the dataframe rev.sentiment$sentiment.score <- sapply(rev.sentiment$reviews, sent.analysis) rev.sentiment$sent.score <- sapply(rev.sentiment$reviews, 
sent.analysis) # Labeling the sentiment score with sentiment rev.sentiment$sentiment.label <- ifelse(rev.sentiment$sentiment.score == 1, "Positive", ifelse(rev.sentiment$sentiment.score == -1, "Negative", "Neutral")) rev.sentiment$sent.label <- ifelse(rev.sentiment$sent.score == 1, "Positive", ifelse(rev.sentiment$sent.score == -1, "Negative", "Neutral")) # Counting the total number reviews: positive.counts <- sum(rev.sentiment$sentiment.label == "Positive") positive.counts <- sum(rev.sentiment$sent.label == "Positive") cat("The total number of positive reviews are", positive.counts) negative.counts <- sum(rev.sentiment$sentiment.label == "Negative") negative.counts <- sum(rev.sentiment$sent.label == "Negative") cat("The total number of negative reviews are", negative.counts) neutral.counts <- sum(rev.sentiment$sentiment.label == "Neutral") neutral.counts <- sum(rev.sentiment$sent.label == "Neutral") cat("The total number of neutral reviews are", neutral.counts) # Syuzhet sentiment analysis using bing dictionary------------------------------ Loading @@ -85,17 +89,17 @@ cat("The total number of neutral reviews are", neutral.counts) rev.sentiment$score.syuzhet <- get_sentiment(rev.sentiment$reviews, method = "bing") # Labeling the sentiment score with sentiment rev.sentiment$sent.label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive", rev.sentiment$label.syuzhet <- ifelse(rev.sentiment$score.syuzhet >= 1, "Positive", ifelse(rev.sentiment$score.syuzhet <= -1, "Negative", "Neutral")) # Sentiment scores by manually and with syuzhet package and chceking if its identical or not rev.sentiment$identical.score <- ifelse(rev.sentiment$sentiment.score == score.syuzhet, 1 , 0) # Sentiment scores by manually and with syuzhet package and checking if its identical or not rev.sentiment$identical.score <- ifelse(rev.sentiment$sent.score == rev.sentiment$score.syuzhet, 1 , 0) # Calculate the percentage of identical scores percentage.identical <- 
mean(rev.sentiment$identical.score) * 100 # Display the result cat("Percentage of Identical Score:", percentage.identical, "%\n") cat("Percentage of Identical Score:", round(percentage.identical,2), "%\n") # Sentiment analysis by using other two dictionaries from syuhet package-------- Loading @@ -103,29 +107,57 @@ cat("Percentage of Identical Score:", percentage.identical, "%\n") rev.sentiment$score.afinn <- get_sentiment(rev.sentiment$reviews, method = "afinn") # Labeling sentiment score with sentiment rev.sentiment$sent.label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive", rev.sentiment$label.afinn <- ifelse(rev.sentiment$score.afinn >= 1, "Positive", ifelse(rev.sentiment$score.afinn <= -1, "Negative", "Neutral")) # Sentiment score using nrc dictionary rev.sentiment$score.nrc <- get_nrc_sentiment(rev.sentiment$reviews) # Wordcloud based on emotions--------------------------------------------------- reviews <- rev.sentiment$reviews # Emotion analysis--------------------------------------------------- # Extract the NRC emotion scores for each term # Assuming 'nrc_scores' is a column containing NRC scores for each review in your rev.sentiment dataframe nrc_scores <- rev.sentiment$score.nrc # Extracting the NRC emotion scores for each term nrc.scores <- rev.sentiment$score.nrc #Sum the NRC scores for each emotion across all reviews aggregate_nrc <- colSums(nrc_scores, na.rm = TRUE) nrc.scores <- colSums(nrc.scores, na.rm = TRUE) # Convert the aggregated scores to a data frame nrc_aggregated_df <- data.frame(emotion = names(aggregate_nrc), score = aggregate_nrc) # Remove rows with NAs (if any) nrc_aggregated_df <- nrc_aggregated_df[complete.cases(nrc_aggregated_df), ] # Create the word cloud wordcloud(words = nrc_aggregated_df$emotion, freq = nrc_aggregated_df$score, min.freq = 1, scale = c(3, 0.5), colors = brewer.pal(8, "Dark2")) nrc.scores <- data.frame(emotion = names(nrc.scores), score = nrc.scores) # Order the dataframe by scores in descending order 
nrc.scores <- nrc.scores[order(-nrc.scores$score), ]

# After the sort above, row 1 holds the highest-scoring emotion and the last
# row holds the lowest-scoring one; the reporting at the end relies on this.

# Custom colour palette; colours are assigned to the emotions in the order of
# the factor levels set below (i.e. from highest to lowest score)
custom_palette <- c(
  "#a1d99b", "#f03b20", "#FFC3A0", "#FF677D", "#D4A5A5",
  "#fec44f", "#e6550d", "#fc9272", "#9ecae1", "#bdbdbd"
)

# Fix the factor levels to the sorted order so the bars are drawn from the
# highest to the lowest score instead of alphabetically
nrc.scores$emotion <- factor(nrc.scores$emotion, levels = nrc.scores$emotion)

# Bar plot of the emotion scores with the value printed inside each bar
emotion.plot <- ggplot(nrc.scores, aes(x = emotion, y = score, fill = emotion)) +
  geom_col(color = "black", width = 0.7) +
  geom_text(aes(label = round(score, 2)),
            vjust = 1.5, color = "black", size = 3) +
  scale_fill_manual(values = custom_palette) +
  labs(title = "Emotion Scores", x = "Emotions", y = "Scores") +
  theme(
    legend.position = "bottom",
    legend.box.margin = margin(3, 3, 3, 3),
    plot.title = element_text(hjust = 0.5, face = "bold"),  # centred, bold title
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"),
    plot.margin = margin(25, 25, 25, 25),  # equal margin on all four sides
    legend.key.size = unit(0.5, "cm")      # size of the legend keys
  )

# Show the plot explicitly so it is also drawn when the script is source()d
print(emotion.plot)

# Save the plot; the plot object is passed explicitly instead of relying on
# last_plot(), and the argument is spelled out as `filename`.
# NOTE(review): the output path is machine-specific and dpi = 1600 yields a
# very large image for a 15 x 15 cm plot -- confirm both are intended.
ggsave(filename = "C:/Users/hetvi/OneDrive/Desktop/Emotions.jpg",
       plot = emotion.plot,
       width = 15, height = 15, units = "cm", dpi = 1600)

# Calculating weightage of emotions (share of each emotion in the total, in %)
nrc.scores <- nrc.scores %>%
  mutate(weightage = score / sum(score) * 100)

# Report the emotions with the highest and the lowest weightage. The names and
# positions are read from the sorted data rather than hard-coded, so the
# messages stay correct if the ranking or the number of emotions changes.
top.emotion <- nrc.scores[1, ]
bottom.emotion <- nrc.scores[nrow(nrc.scores), ]

cat("An emotion with highest weightage is",
    as.character(top.emotion$emotion), "with",
    round(top.emotion$weightage, 2), "%\n")
cat("An emotion with lowest weightage is",
    as.character(bottom.emotion$emotion), "with",
    round(bottom.emotion$weightage, 2), "%\n")

# Checking the consistency of sentiment with positive and negative voted_up