Commit 4dcf5d6b authored by guptasuuraj's avatar guptasuuraj
Browse files

Initial commit

parents
Loading
Loading
Loading
Loading
+14 KiB

File added.

No diff preview for this file type.

PoDS_Ass_Team18/.RData

0 → 100644
+850 KiB

File added.

No diff preview for this file type.

+512 −0
Original line number Diff line number Diff line
# Net NRC polarity per review:
#   (positive - negative) / (positive + negative), rounded to two decimals.
# Reviews whose positive and negative counts sum to zero get a score of 0
# instead of a division by zero.
nrc_denom <- reviews_final[["nrc_positive"]] + reviews_final[["nrc_negative"]]
reviews_final$sent_nrc <- ifelse(
  nrc_denom == 0,
  0,
  round(
    (reviews_final[["nrc_positive"]] - reviews_final[["nrc_negative"]]) /
      nrc_denom,
    2
  )
)

# Copy every NRC emotion column of nrc_mat into reviews_final under the
# name "nrc_<emotion>".
emotion_cols <- c(
  "anger", "anticipation", "disgust", "fear",
  "joy", "sadness", "surprise", "trust"
)
for (ec in emotion_cols) {
  reviews_final[[paste0("nrc_", ec)]] <- nrc_mat[[ec]]
}
# =============================================================================
# 7) Summary stats (needed for report questions)
# =============================================================================
# This section appeared twice verbatim; it is kept once because the second
# copy only recomputed the same values.

# Share of positive reviews taken from the aggregated review summary.
overall_positive_share <- as.numeric(revsummary$total_positive) /
  as.numeric(revsummary$total_reviews)

# Mean sentiment score per method, ignoring missing values.
avg_sent_manual  <- mean(reviews_final$sent_manual, na.rm = TRUE)
avg_sent_syuzhet <- mean(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
avg_sent_nrc     <- mean(reviews_final$sent_nrc, na.rm = TRUE)
avg_sent_llm     <- mean(reviews_final$sent_llm, na.rm = TRUE)

# Correlations between the lexicon-based scores (complete pairs only).
corr_manual_vs_nrc <- cor(
  reviews_final$sent_manual, reviews_final$sent_nrc,
  use = "complete.obs"
)
corr_syuzhet_vs_nrc <- cor(
  reviews_final$sent_syuzhet_bing, reviews_final$sent_nrc,
  use = "complete.obs"
)

# The LLM column may be absent or entirely NA. cor(use = "complete.obs")
# stops with an error when there are no complete pairs, so the correlation
# is only computed when at least two complete pairs exist; otherwise NA.
corr_llm_vs_manual <- if (
  !is.null(reviews_final$sent_llm) &&
    sum(complete.cases(reviews_final$sent_llm,
                       reviews_final$sent_manual)) >= 2
) {
  cor(reviews_final$sent_llm, reviews_final$sent_manual,
      use = "complete.obs")
} else {
  NA_real_
}
# Make sure the LLM sentiment column exists; it is filled with NA when the
# column is not part of reviews_final.
if (!"sent_llm" %in% names(reviews_final)) {
  reviews_final$sent_llm <- NA_real_
}

# Share of positive reviews taken from the aggregated review summary.
overall_positive_share <- as.numeric(revsummary$total_positive) /
  as.numeric(revsummary$total_reviews)

# Mean sentiment score per method, ignoring missing values.
avg_sent_manual  <- mean(reviews_final$sent_manual, na.rm = TRUE)
avg_sent_syuzhet <- mean(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
avg_sent_nrc     <- mean(reviews_final$sent_nrc, na.rm = TRUE)
avg_sent_llm     <- mean(reviews_final$sent_llm, na.rm = TRUE)

# Correlations between the lexicon-based scores (complete pairs only).
corr_manual_vs_syuzhet <- cor(
  reviews_final$sent_manual, reviews_final$sent_syuzhet_bing,
  use = "complete.obs"
)
corr_manual_vs_nrc <- cor(
  reviews_final$sent_manual, reviews_final$sent_nrc,
  use = "complete.obs"
)
corr_syuzhet_vs_nrc <- cor(
  reviews_final$sent_syuzhet_bing, reviews_final$sent_nrc,
  use = "complete.obs"
)

# The guard above can leave sent_llm entirely NA, and
# cor(use = "complete.obs") stops with an error when no complete pairs
# exist. Compute the correlation only when at least two complete pairs are
# available; otherwise report NA.
corr_llm_vs_manual <- if (
  sum(complete.cases(reviews_final$sent_llm,
                     reviews_final$sent_manual)) >= 2
) {
  cor(reviews_final$sent_llm, reviews_final$sent_manual,
      use = "complete.obs")
} else {
  NA_real_
}
# -------------------------------
# Helper function: safe correlation
# -------------------------------
# cor() cannot produce a value when fewer than two rows have BOTH variables
# present, or when one variable is constant over those rows (zero variance;
# cor() then warns and yields NA). safe_cor() checks both situations first
# and quietly returns NA instead.
#
# Arguments:
#   x, y  vectors of equal length (complete.cases() requires equal lengths).
# Value:
#   cor() of the complete pairs, or NA_real_ when it cannot be computed.
safe_cor <- function(x, y) {
  ok <- complete.cases(x, y)   # rows where both x and y are available
  if (sum(ok) < 2) {
    return(NA_real_)           # need at least 2 observations
  }
  x_ok <- x[ok]
  y_ok <- y[ok]
  # A vector with a single distinct value has zero variance.
  if (length(unique(x_ok)) < 2 || length(unique(y_ok)) < 2) {
    return(NA_real_)
  }
  cor(x_ok, y_ok)
}
# ------------------------------------------------
# Guarantee that an LLM sentiment column is present
# ------------------------------------------------
# If reviews_final has no sent_llm column, add one filled with NA so the
# statistics below can reference it unconditionally.
if (!("sent_llm" %in% colnames(reviews_final))) {
  reviews_final[["sent_llm"]] <- NA_real_
}

# ------------------------------------------------
# Positive share according to the review summary
# ------------------------------------------------
# Derived from revsummary only; the text-based scores are not involved.
overall_positive_share <-
  as.numeric(revsummary$total_positive) / as.numeric(revsummary$total_reviews)

# ------------------------------------------------
# Mean score of each sentiment method
# ------------------------------------------------
# Missing values are dropped before averaging.
avg_sent_manual  <- mean(reviews_final[["sent_manual"]], na.rm = TRUE)
avg_sent_syuzhet <- mean(reviews_final[["sent_syuzhet_bing"]], na.rm = TRUE)
avg_sent_nrc     <- mean(reviews_final[["sent_nrc"]], na.rm = TRUE)
avg_sent_llm     <- mean(reviews_final[["sent_llm"]], na.rm = TRUE)

# ------------------------------------------------
# Pairwise correlations of the lexicon-based methods
# ------------------------------------------------
# Only rows where both scores are present enter each correlation.
corr_manual_vs_syuzhet <- cor(
  x = reviews_final[["sent_manual"]],
  y = reviews_final[["sent_syuzhet_bing"]],
  use = "complete.obs"
)
corr_manual_vs_nrc <- cor(
  x = reviews_final[["sent_manual"]],
  y = reviews_final[["sent_nrc"]],
  use = "complete.obs"
)
corr_syuzhet_vs_nrc <- cor(
  x = reviews_final[["sent_syuzhet_bing"]],
  y = reviews_final[["sent_nrc"]],
  use = "complete.obs"
)

# ------------------------------------------------
# Correlations that involve the LLM score
# ------------------------------------------------
# sent_llm can be NA for every review, so these go through safe_cor(),
# which yields NA rather than stopping.
corr_llm_vs_manual <- safe_cor(
  reviews_final[["sent_llm"]], reviews_final[["sent_manual"]]
)
corr_llm_vs_nrc <- safe_cor(
  reviews_final[["sent_llm"]], reviews_final[["sent_nrc"]]
)
# Share of reviews whose manual-sentiment sign agrees with the voted_up
# flag. The result is NA when reviews_final has no voted_up column.
align_votedup <- if ("voted_up" %in% names(reviews_final)) {
  # Binary prediction: TRUE for a strictly positive score, FALSE for a
  # zero or negative score.
  pred_pos <- reviews_final$sent_manual > 0
  # The mean of the logical match vector is the fraction of agreeing
  # reviews; rows where the comparison is NA are ignored.
  mean(pred_pos == as.logical(reviews_final$voted_up), na.rm = TRUE)
} else {
  NA_real_
}
# Initialize outputs (defaults used when votes_up is missing)
extreme_help_corr <- NA_real_
extreme_help_bins <- NULL
# Proceed only if helpfulness votes are available
if ("votes_up" %in% names(reviews_final)) {
  # Extremeness is the absolute value of the manual sentiment score,
  # regardless of its sign.
  abs_sent <- abs(reviews_final$sent_manual)

  # ------------------------------------------------
  # Correlation between sentiment extremeness and votes_up
  # ------------------------------------------------
  extreme_help_corr <- cor(abs_sent,
                           reviews_final$votes_up,
                           use = "complete.obs")

  # ------------------------------------------------
  # Group reviews into low / mid / high extremeness
  # ------------------------------------------------
  # Cut points are the 33% and 66% quantiles of extremeness.
  q <- quantile(abs_sent, probs = c(0.33, 0.66), na.rm = TRUE)
  # cut() stops with "'breaks' are not unique" when both quantiles
  # coincide (e.g. many scores equal to 0). Counting how many cut points a
  # value exceeds yields the same right-closed bins
  #   low: x <= q1,  mid: q1 < x <= q2,  high: x > q2
  # and simply leaves "mid" empty when q1 == q2.
  bin_labels <- c("low", "mid", "high")
  bin <- factor(
    bin_labels[1L + (abs_sent > q[[1]]) + (abs_sent > q[[2]])],
    levels = bin_labels
  )

  # ------------------------------------------------
  # Average helpful votes per extremeness group
  # ------------------------------------------------
  extreme_help_bins <- aggregate(reviews_final$votes_up,
                                 by = list(bin = bin),
                                 FUN = mean,
                                 na.rm = TRUE)
}
# Monthly mean sentiment per method. Stays NULL when reviews_final has no
# timestamp_created column.
monthly_sentiment <- NULL
if ("timestamp_created" %in% names(reviews_final)) {
  # Derive a calendar day and a "YYYY-MM" month key for every review.
  # NOTE(review): as.Date() is applied to timestamp_created as-is, so it
  # presumably already holds a date-like value rather than raw epoch
  # seconds; confirm against the code that builds reviews_final.
  reviews_final[["day_created"]] <- as.Date(reviews_final$timestamp_created)
  reviews_final[["month_created"]] <- format(
    reviews_final[["day_created"]], "%Y-%m"
  )

  # Average each sentiment column within a month, dropping NAs.
  monthly_sentiment <- aggregate(
    reviews_final[, c("sent_manual", "sent_syuzhet_bing",
                      "sent_nrc", "sent_llm")],
    by = list(month = reviews_final[["month_created"]]),
    FUN = mean,
    na.rm = TRUE
  )

  # "YYYY-MM" keys sort chronologically, which the trend plot relies on.
  monthly_sentiment <-
    monthly_sentiment[order(monthly_sentiment[["month"]]), ]
}
# ------------------------------------------------
# Density plot: Manual vs Syuzhet-Bing vs NRC
# ------------------------------------------------
# Writes one PDF with the three score densities overlaid.
# plot() derives its y-range from the first curve only, so a taller
# Syuzhet/NRC curve would be cut off. All three densities are therefore
# computed first and the y-axis is sized to the tallest one.
d1 <- density(reviews_final$sent_manual, na.rm = TRUE)
d2 <- density(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
d3 <- density(reviews_final$sent_nrc, na.rm = TRUE)
ymax <- max(d1$y, d2$y, d3$y)
pdf(file.path(GRAPH_DIR, "task5_sentiment_density.pdf"),
    width = 9, height = 5)
plot(d1,
     main = "Sentiment Density (Manual vs Syuzhet-Bing vs NRC)",
     xlab = "Sentiment score",
     ylab = "Density",
     col = "green",
     ylim = c(0, ymax * 1.1))  # 10% headroom above the tallest curve
lines(d2, col = "red")
lines(d3, col = "blue")
legend("topright",
       legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
       col = c("green", "red", "blue"),
       lty = 1,
       bty = "n")
dev.off()
# ------------------------------------------------
# NRC emotion distribution (bar plot)
# ------------------------------------------------
# Sum each emotion column of nrc_mat over all rows and draw the totals.
# Negative emotions are shown in grey, the remaining ones in yellow.
emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE],
                          na.rm = TRUE)
neg_emotions <- c("anger", "disgust", "fear", "sadness")
emo_colors <- ifelse(names(emotion_totals) %in% neg_emotions,
                     "grey70", "yellow")
pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
    width = 9, height = 5)
barplot(emotion_totals,
        main = "NRC Emotion Counts (Top 1,000 reviews if limited)",
        xlab = "Emotion",
        ylab = "Total count",
        las = 2,
        col = emo_colors)
# Close the PDF device. Without this the file is never finalized and the
# device stays open underneath every later plot.
dev.off()
# ------------------------------------------------
# Monthly sentiment trend plot
# ------------------------------------------------
# Only produced if at least two months of data exist.
if (!is.null(monthly_sentiment) && nrow(monthly_sentiment) >= 2) {
  # Months are plotted at positions 1..n and labelled with "YYYY-MM".
  xax <- seq_len(nrow(monthly_sentiment))
  # plot() would size the y-axis from sent_manual alone and clip the other
  # two series; use the finite range across all three plotted series.
  y_range <- range(monthly_sentiment$sent_manual,
                   monthly_sentiment$sent_syuzhet_bing,
                   monthly_sentiment$sent_nrc,
                   finite = TRUE)
  pdf(file.path(GRAPH_DIR, "task5_monthly_sentiment.pdf"),
      width = 9, height = 5)
  plot(xax, monthly_sentiment$sent_manual,
       type = "l",
       main = "Average Sentiment Over Time (Monthly)",
       xlab = "Month",
       ylab = "Average sentiment",
       xaxt = "n",
       ylim = y_range,
       col = "green")
  axis(1, at = xax,
       labels = monthly_sentiment$month,
       las = 2,
       cex.axis = 0.7)
  lines(xax, monthly_sentiment$sent_syuzhet_bing, col = "red")
  lines(xax, monthly_sentiment$sent_nrc, col = "blue")
  legend("topright",
         legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
         col = c("green", "red", "blue"),
         lty = 1,
         bty = "n")
  dev.off()
} else {
  message("Monthly sentiment plot skipped: < 2 months of data.")
}
# ------------------------------------------------
# Persist the review-level table with its sentiment columns
# ------------------------------------------------
save(list = "reviews_final",
     file = file.path(DATA_DIR, "reviews_final_with_sentiment.RData"))

# ------------------------------------------------
# Bundle every key result of this task into one named list
# ------------------------------------------------
task5.summary <- list(
  # positive share and per-method averages
  overall_positive_share = overall_positive_share,
  avg_sent_manual        = avg_sent_manual,
  avg_sent_syuzhet       = avg_sent_syuzhet,
  avg_sent_nrc           = avg_sent_nrc,
  avg_sent_llm           = avg_sent_llm,
  # pairwise correlations between methods
  corr_manual_vs_syuzhet = corr_manual_vs_syuzhet,
  corr_manual_vs_nrc     = corr_manual_vs_nrc,
  corr_syuzhet_vs_nrc    = corr_syuzhet_vs_nrc,
  corr_llm_vs_manual     = corr_llm_vs_manual,
  corr_llm_vs_nrc        = corr_llm_vs_nrc,
  # agreement with voted_up, helpfulness and time trend
  align_votedup          = align_votedup,
  extreme_help_corr      = extreme_help_corr,
  extreme_help_bins      = extreme_help_bins,
  monthly_sentiment      = monthly_sentiment
)

# ------------------------------------------------
# Persist the summary list for later use
# ------------------------------------------------
save(list = "task5.summary",
     file = file.path(DATA_DIR, "task5_summary.RData"))
# ------------------------------------------------
# NRC emotion distribution (bar plot)
# ------------------------------------------------
# Column sums of the emotion columns in nrc_mat give one total per emotion.
# Bars for anger, disgust, fear and sadness are drawn in "grey70", all
# other bars in "gold".
emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE], na.rm = TRUE)

# Nothing to draw when the totals do not add up to a positive count.
if (!(sum(emotion_totals, na.rm = TRUE) > 0)) {
  message("NRC emotion plot skipped: no emotion counts available.")
} else {
  neg_emotions <- c("anger", "disgust", "fear", "sadness")
  emo_colors <- ifelse(
    names(emotion_totals) %in% neg_emotions, "grey70", "gold"
  )
  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
      width = 10, height = 6)
  barplot(
    emotion_totals,
    col = emo_colors,
    las = 2,
    cex.names = 0.9,
    main = "NRC Emotion Counts",
    xlab = "Emotion",
    ylab = "Total count"
  )
  dev.off()
}
# ------------------------------------------------
# NRC emotion distribution (bar plot)
# ------------------------------------------------
# Aggregate total counts of each emotion across all rows of nrc_mat.
# Negative emotions are shown in grey, the remaining ones in gold.
emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE],
                          na.rm = TRUE)
# Safety check: skip plot if the totals do not sum to a positive count
if (sum(emotion_totals, na.rm = TRUE) > 0) {
  neg_emotions <- c("anger", "disgust", "fear", "sadness")
  # ifelse() needs both a `yes` and a `no` value; calling it with only
  # "grey" fails with 'argument "no" is missing'. Non-negative emotions get
  # "gold" so every bar has a colour.
  emo_colors <- ifelse(names(emotion_totals) %in% neg_emotions,
                       "grey", "gold")
  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
      width = 10, height = 6)
  barplot(emotion_totals,
          main = "NRC Emotion Counts",
          xlab = "Emotion",
          ylab = "Total count",
          las = 2,
          col = emo_colors,
          cex.names = 0.9)
  dev.off()
} else {
  message("NRC emotion plot skipped: no emotion counts available.")
}
# Redraw the NRC emotion bar chart from the existing emotion_totals:
# anger, disgust, fear and sadness in red, every other emotion in green.
# The plot is skipped when the totals do not add up to a positive count.
if (!(sum(emotion_totals, na.rm = TRUE) > 0)) {
  message("NRC emotion plot skipped: no emotion counts available.")
} else {
  neg_emotions <- c("anger", "disgust", "fear", "sadness")
  emo_colors <- ifelse(
    names(emotion_totals) %in% neg_emotions, "red", "green"
  )
  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
      width = 10, height = 6)
  barplot(
    emotion_totals,
    col = emo_colors,
    las = 2,
    cex.names = 0.9,
    main = "NRC Emotion Counts",
    xlab = "Emotion",
    ylab = "Total count"
  )
  dev.off()
}
# Overlay the three score densities on the currently active graphics
# device. plot() sizes the y-axis from the first curve only, which can clip
# the other two; compute all densities first and span the tallest one.
d1 <- density(reviews_final$sent_manual, na.rm = TRUE)
d2 <- density(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
d3 <- density(reviews_final$sent_nrc, na.rm = TRUE)
ymax <- max(d1$y, d2$y, d3$y)
plot(d1,
     main = "Sentiment Density (Manual vs Syuzhet-Bing vs NRC)",
     xlab = "Sentiment score",
     ylab = "Density",
     col = "green",
     ylim = c(0, ymax * 1.1))  # 10% headroom above the tallest curve
lines(d2, col = "red")
lines(d3, col = "blue")
legend("topright",
       legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
       col = c("green", "red", "blue"),
       lty = 1,
       bty = "n")
# =============================================================================
# 11) Graphical comparison of sentiment methods
# This section produces publication-ready plots used in the report.
# =============================================================================
# ------------------------------------------------
# Density plot: Manual vs Syuzhet-Bing vs NRC
# ------------------------------------------------
# The same PDF was generated twice by two verbatim copies of this code; it
# is kept once, since the second run only overwrote the file with the same
# plot.
pdf(file.path(GRAPH_DIR, "task5_sentiment_density.pdf"),
    width = 10, height = 6)
par(mar = c(5, 5, 4, 2) + 0.1)  # increase left margin
d1 <- density(reviews_final$sent_manual, na.rm = TRUE)
d2 <- density(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
d3 <- density(reviews_final$sent_nrc, na.rm = TRUE)
# Size the y-axis to the tallest of the three curves (plus 10% headroom)
# so none of them is clipped.
ymax <- max(d1$y, d2$y, d3$y)
plot(d1,
     main = "Sentiment Density (Manual vs Syuzhet-Bing vs NRC)",
     xlab = "Sentiment score",
     ylab = "Density",
     col = "green",
     ylim = c(0, ymax * 1.1),
     lwd = 2)
lines(d2, col = "red", lwd = 2)
lines(d3, col = "blue", lwd = 2)
legend("topright",
       legend = c("Manual", "Syuzhet Bing", "NRC"),
       col = c("green", "red", "blue"),
       lty = 1,
       lwd = 2,
       bty = "n")
dev.off()
# ------------------------------------------------
# NRC emotion distribution (bar plot)
# ------------------------------------------------
# Aggregate total counts of each emotion across all rows of nrc_mat.
# Negative emotions are shown in red, the remaining ones in green.
emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE],
                          na.rm = TRUE)
# Safety check: skip plot if the totals do not sum to a positive count
if (sum(emotion_totals, na.rm = TRUE) > 0) {
  neg_emotions <- c("anger", "disgust", "fear", "sadness")
  emo_colors <- ifelse(names(emotion_totals) %in% neg_emotions,
                       "red", "green")
  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
      width = 10, height = 6)
  barplot(emotion_totals,
          main = "NRC Emotion Counts",
          xlab = "Emotion",
          ylab = "Total count",
          las = 2,
          col = emo_colors,
          cex.names = 0.9,
          # Keep the 0-1200 axis, but extend it when a total exceeds 1200
          # so no bar overshoots the axis.
          ylim = c(0, max(1200, max(emotion_totals))))
  dev.off()
} else {
  message("NRC emotion plot skipped: no emotion counts available.")
}
# ------------------------------------------------
# Monthly sentiment trend plot
# ------------------------------------------------
# Only produced if at least two months of data exist.
if (!is.null(monthly_sentiment) && nrow(monthly_sentiment) >= 2) {
  # Months are plotted at positions 1..n and labelled with "YYYY-MM".
  xax <- seq_len(nrow(monthly_sentiment))
  # plot() would size the y-axis from sent_manual alone and clip the other
  # two series; use the finite range across all three plotted series.
  y_range <- range(monthly_sentiment$sent_manual,
                   monthly_sentiment$sent_syuzhet_bing,
                   monthly_sentiment$sent_nrc,
                   finite = TRUE)
  pdf(file.path(GRAPH_DIR, "task5_monthly_sentiment.pdf"),
      width = 9, height = 5)
  plot(xax, monthly_sentiment$sent_manual,
       type = "l",
       main = "Average Sentiment Over Time (Monthly)",
       xlab = "Month",
       ylab = "Average sentiment",
       xaxt = "n",
       ylim = y_range,
       col = "green")
  axis(1, at = xax,
       labels = monthly_sentiment$month,
       las = 2,
       cex.axis = 0.7)
  lines(xax, monthly_sentiment$sent_syuzhet_bing, col = "red")
  lines(xax, monthly_sentiment$sent_nrc, col = "blue")
  legend("topright",
         legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
         col = c("green", "red", "blue"),
         lty = 1,
         bty = "n")
  dev.off()
} else {
  message("Monthly sentiment plot skipped: < 2 months of data.")
}
# =============================================================================
# 12) Save processed data and summary outputs
# This section stores all results needed for reproducibility and reporting.
# =============================================================================
# ------------------------------------------------
# Save review-level data with sentiment scores
# ------------------------------------------------
# The identical save() call was issued twice in a row; one call is enough,
# because the second only rewrote the same file.
save(reviews_final,
     file = file.path(DATA_DIR,
                      "reviews_final_with_sentiment.RData"))
# ------------------------------------------------
# Collect all key results into a single list
# ------------------------------------------------
task5.summary <- list(
  overall_positive_share = overall_positive_share,
  avg_sent_manual = avg_sent_manual,
  avg_sent_syuzhet = avg_sent_syuzhet,
  avg_sent_nrc = avg_sent_nrc,
  avg_sent_llm = avg_sent_llm,
  corr_manual_vs_syuzhet = corr_manual_vs_syuzhet,
  corr_manual_vs_nrc = corr_manual_vs_nrc,
  corr_syuzhet_vs_nrc = corr_syuzhet_vs_nrc,
  corr_llm_vs_manual = corr_llm_vs_manual,
  corr_llm_vs_nrc = corr_llm_vs_nrc,
  align_votedup = align_votedup,
  extreme_help_corr = extreme_help_corr,
  extreme_help_bins = extreme_help_bins,
  monthly_sentiment = monthly_sentiment
)
# ------------------------------------------------
# Save summary object for later use
# ------------------------------------------------
save(task5.summary,
     file = file.path(DATA_DIR,
                      "task5_summary.RData"))
+8 KiB

File added.

No diff preview for this file type.

+192 KiB

File added.

No diff preview for this file type.