Initial commit (4dcf5d6b) · Commits · Suraj Gupta / PoDS_Ass_Team18

PoDS_Ass_Team18/.DS_Store

0 → 100644

+14 KiB

File added.

No diff preview for this file type.

View file

PoDS_Ass_Team18/.RData

0 → 100644

+850 KiB

File added.

No diff preview for this file type.

View file

PoDS_Ass_Team18/.Rhistory

0 → 100644

+512 −0

Original line number	Diff line number	Diff line
		# NRC polarity per review: (positive - negative) / (positive + negative),
		# rounded to two decimals. Reviews whose denominator is 0 are scored 0.
		nrc_denom <- reviews_final$nrc_positive + reviews_final$nrc_negative
		nrc_net <- reviews_final$nrc_positive - reviews_final$nrc_negative
		reviews_final$sent_nrc <- ifelse(
		  nrc_denom == 0,
		  0,
		  round(nrc_net / nrc_denom, 2)
		)
		# Copy each NRC emotion column from nrc_mat into reviews_final as nrc_<emotion>.
		emotion_cols <- c(
		  "anger", "anticipation", "disgust", "fear",
		  "joy", "sadness", "surprise", "trust"
		)
		for (emotion in emotion_cols) {
		  reviews_final[[paste0("nrc_", emotion)]] <- nrc_mat[[emotion]]
		}
		# =============================================================================
		# 7) Summary stats (needed for report questions)
		# =============================================================================
		# The LLM sentiment column may be absent; create it as all-NA *before* any
		# statistic reads it, so the means and correlations below cannot fail on a
		# missing column.
		if (!"sent_llm" %in% names(reviews_final)) {
		  reviews_final$sent_llm <- NA_real_
		}
		# Share of positive reviews, taken from the aggregated review summary.
		overall_positive_share <- as.numeric(revsummary$total_positive) /
		  as.numeric(revsummary$total_reviews)
		# Mean sentiment per scoring method (missing scores are ignored).
		avg_sent_manual <- mean(reviews_final$sent_manual, na.rm = TRUE)
		avg_sent_syuzhet <- mean(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
		avg_sent_nrc <- mean(reviews_final$sent_nrc, na.rm = TRUE)
		avg_sent_llm <- mean(reviews_final$sent_llm, na.rm = TRUE)
		# Pairwise correlations between the lexicon-based scores.
		corr_manual_vs_syuzhet <- cor(reviews_final$sent_manual,
		                              reviews_final$sent_syuzhet_bing,
		                              use = "complete.obs")
		corr_manual_vs_nrc <- cor(reviews_final$sent_manual,
		                          reviews_final$sent_nrc,
		                          use = "complete.obs")
		corr_syuzhet_vs_nrc <- cor(reviews_final$sent_syuzhet_bing,
		                           reviews_final$sent_nrc,
		                           use = "complete.obs")
		# LLM vs manual: cor(use = "complete.obs") stops with an error when no row
		# has both values (e.g. sent_llm is entirely NA), so count the usable pairs
		# first and fall back to NA when fewer than two exist.
		llm_pairs <- complete.cases(reviews_final$sent_llm,
		                            reviews_final$sent_manual)
		corr_llm_vs_manual <- if (sum(llm_pairs) >= 2) {
		  cor(reviews_final$sent_llm[llm_pairs],
		      reviews_final$sent_manual[llm_pairs])
		} else {
		  NA_real_
		}
		# -------------------------------
		# Helper function: safe correlation
		# -------------------------------
		# cor() fails if there are no rows where BOTH variables are non-missing.
		# This function checks first and returns NA if correlation is impossible.
		safe_cor <- function(x, y) {
		  # Correlation of x and y over the positions where both are non-missing.
		  # Returns NA_real_ when fewer than two such positions exist.
		  paired <- complete.cases(x, y)
		  if (sum(paired) >= 2) {
		    cor(x[paired], y[paired])
		  } else {
		    NA_real_
		  }
		}
		# ------------------------------------------------
		# Guarantee that an LLM sentiment column is present
		# ------------------------------------------------
		# If sent_llm is not among the columns, add it as an all-NA numeric column
		# so the statistics below can still be evaluated.
		if (!is.element("sent_llm", names(reviews_final))) {
		  reviews_final$sent_llm <- NA_real_
		}
		# ------------------------------------------------
		# Share of positive reviews (from revsummary)
		# ------------------------------------------------
		# Uses only the aggregated totals stored in revsummary.
		overall_positive_share <-
		  as.numeric(revsummary$total_positive) / as.numeric(revsummary$total_reviews)
		# ------------------------------------------------
		# Mean sentiment per method
		# ------------------------------------------------
		# Missing scores are dropped before averaging.
		avg_sent_manual <- with(reviews_final, mean(sent_manual, na.rm = TRUE))
		avg_sent_syuzhet <- with(reviews_final, mean(sent_syuzhet_bing, na.rm = TRUE))
		avg_sent_nrc <- with(reviews_final, mean(sent_nrc, na.rm = TRUE))
		avg_sent_llm <- with(reviews_final, mean(sent_llm, na.rm = TRUE))
		# ------------------------------------------------
		# Lexicon-vs-lexicon correlations
		# ------------------------------------------------
		# Each correlation uses only rows where both scores are present.
		corr_manual_vs_syuzhet <- with(
		  reviews_final,
		  cor(sent_manual, sent_syuzhet_bing, use = "complete.obs")
		)
		corr_manual_vs_nrc <- with(
		  reviews_final,
		  cor(sent_manual, sent_nrc, use = "complete.obs")
		)
		corr_syuzhet_vs_nrc <- with(
		  reviews_final,
		  cor(sent_syuzhet_bing, sent_nrc, use = "complete.obs")
		)
		# ------------------------------------------------
		# LLM-vs-lexicon correlations
		# ------------------------------------------------
		# sent_llm can be NA for every review, so these go through safe_cor(),
		# which returns NA instead of raising an error.
		corr_llm_vs_manual <- with(reviews_final, safe_cor(sent_llm, sent_manual))
		corr_llm_vs_nrc <- with(reviews_final, safe_cor(sent_llm, sent_nrc))
		# Share of reviews where the sign of the manual sentiment agrees with the
		# voted_up flag; NA when the voted_up column is not available.
		align_votedup <- if ("voted_up" %in% names(reviews_final)) {
		  # TRUE = strictly positive manual sentiment, FALSE = zero or negative.
		  pred_pos <- reviews_final$sent_manual > 0
		  # Mean of the logical agreement vector = proportion of matching reviews.
		  mean(pred_pos == as.logical(reviews_final$voted_up), na.rm = TRUE)
		} else {
		  NA_real_
		}
		# Defaults reported when votes_up is unavailable or binning is impossible.
		extreme_help_corr <- NA_real_
		extreme_help_bins <- NULL
		# Proceed only if helpfulness votes are available
		if ("votes_up" %in% names(reviews_final)) {
		  # Extremeness = absolute manual sentiment, regardless of direction.
		  extremeness <- abs(reviews_final$sent_manual)
		  # ------------------------------------------------
		  # Correlation between sentiment extremeness and votes_up
		  # ------------------------------------------------
		  # safe_cor() gives NA instead of an error when fewer than two complete
		  # (extremeness, votes_up) pairs exist.
		  extreme_help_corr <- safe_cor(extremeness, reviews_final$votes_up)
		  # ------------------------------------------------
		  # Group reviews into low / mid / high extremeness
		  # ------------------------------------------------
		  # Binning needs at least one non-missing extremeness value.
		  if (any(!is.na(extremeness))) {
		    # Quantile cut points at 33% and 66%. Heavy ties (e.g. many scores of
		    # exactly 0) can make both cut points equal, and cut() rejects
		    # duplicated breaks, so identical cut points are collapsed first.
		    cut_points <- unique(quantile(extremeness,
		                                  probs = c(0.33, 0.66),
		                                  na.rm = TRUE,
		                                  names = FALSE))
		    bin_labels <- if (length(cut_points) == 2L) {
		      c("low", "mid", "high")
		    } else {
		      c("low", "high") # only one distinct cut point -> two groups
		    }
		    bin <- cut(extremeness,
		               breaks = c(-Inf, cut_points, Inf),
		               labels = bin_labels)
		    # ------------------------------------------------
		    # Average helpful votes per extremeness group
		    # ------------------------------------------------
		    extreme_help_bins <- aggregate(reviews_final$votes_up,
		                                   by = list(bin = bin),
		                                   FUN = mean,
		                                   na.rm = TRUE)
		  }
		}
		# Monthly mean sentiment per method; stays NULL without review timestamps.
		monthly_sentiment <- NULL
		if ("timestamp_created" %in% names(reviews_final)) {
		  # Derive calendar day and "YYYY-MM" month keys from the timestamp.
		  # NOTE(review): as.Date() is applied to timestamp_created as-is; if that
		  # column holds numeric Unix seconds it needs an explicit conversion first
		  # -- confirm the upstream type.
		  reviews_final$day_created <- as.Date(reviews_final$timestamp_created)
		  reviews_final$month_created <- format(reviews_final$day_created, "%Y-%m")
		  # Average every sentiment column within each month (NAs dropped).
		  sentiment_cols <- c("sent_manual", "sent_syuzhet_bing", "sent_nrc", "sent_llm")
		  monthly_sentiment <- aggregate(
		    reviews_final[, sentiment_cols],
		    by = list(month = reviews_final$month_created),
		    FUN = mean,
		    na.rm = TRUE
		  )
		  # "YYYY-MM" strings sort chronologically, which is the order used for plotting.
		  month_order <- order(monthly_sentiment$month)
		  monthly_sentiment <- monthly_sentiment[month_order, ]
		}
		# ------------------------------------------------
		# Density plot: Manual vs Syuzhet-Bing vs NRC
		# ------------------------------------------------
		# Writes the three sentiment-score densities into one PDF.
		pdf(file.path(GRAPH_DIR, "task5_sentiment_density.pdf"),
		    width = 9, height = 5)
		d1 <- density(reviews_final$sent_manual, na.rm = TRUE)
		d2 <- density(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
		d3 <- density(reviews_final$sent_nrc, na.rm = TRUE)
		# plot() sizes the y-axis from the first curve only; take the tallest of
		# all three so the overlaid curves are not cut off at the top.
		ymax <- max(d1$y, d2$y, d3$y)
		plot(d1,
		     main = "Sentiment Density (Manual vs Syuzhet-Bing vs NRC)",
		     xlab = "Sentiment score",
		     ylab = "Density",
		     col = "green",
		     ylim = c(0, ymax * 1.1))
		lines(d2, col = "red")
		lines(d3, col = "blue")
		legend("topright",
		       legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
		       col = c("green", "red", "blue"),
		       lty = 1,
		       bty = "n")
		dev.off()
		# ------------------------------------------------
		# NRC emotion distribution (bar plot)
		# ------------------------------------------------
		# Sum each emotion column of nrc_mat over all rows.
		# Negative emotions are shown in grey, the remaining ones in yellow.
		emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE],
		                          na.rm = TRUE)
		neg_emotions <- c("anger", "disgust", "fear", "sadness")
		emo_colors <- ifelse(names(emotion_totals) %in% neg_emotions,
		                     "grey70", "yellow")
		pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
		    width = 9, height = 5)
		barplot(emotion_totals,
		        main = "NRC Emotion Counts (Top 1,000 reviews if limited)",
		        xlab = "Emotion",
		        ylab = "Total count",
		        las = 2,
		        col = emo_colors)
		# Close the PDF device: without this the file is never finalised and every
		# later plot would keep drawing into a leaked device.
		dev.off()
		# ------------------------------------------------
		# Monthly sentiment trend plot
		# ------------------------------------------------
		# Only produced if at least two months of data exist.
		if (!is.null(monthly_sentiment) && nrow(monthly_sentiment) >= 2) {
		  xax <- seq_len(nrow(monthly_sentiment))
		  # The y-axis must span all three plotted series; sizing it from
		  # sent_manual alone lets the other two lines run off the plot area.
		  trend_cols <- c("sent_manual", "sent_syuzhet_bing", "sent_nrc")
		  y_range <- range(unlist(monthly_sentiment[, trend_cols]), finite = TRUE)
		  pdf(file.path(GRAPH_DIR, "task5_monthly_sentiment.pdf"),
		      width = 9, height = 5)
		  plot(xax, monthly_sentiment$sent_manual,
		       type = "l",
		       main = "Average Sentiment Over Time (Monthly)",
		       xlab = "Month",
		       ylab = "Average sentiment",
		       xaxt = "n", # month labels are drawn manually below
		       ylim = y_range,
		       col = "green")
		  axis(1, at = xax,
		       labels = monthly_sentiment$month,
		       las = 2,
		       cex.axis = 0.7)
		  lines(xax, monthly_sentiment$sent_syuzhet_bing, col = "red")
		  lines(xax, monthly_sentiment$sent_nrc, col = "blue")
		  legend("topright",
		         legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
		         col = c("green", "red", "blue"),
		         lty = 1,
		         bty = "n")
		  dev.off()
		} else {
		  message("Monthly sentiment plot skipped: < 2 months of data.")
		}
		# ------------------------------------------------
		# Persist the review-level table (with sentiment columns)
		# ------------------------------------------------
		reviews_out_path <- file.path(DATA_DIR, "reviews_final_with_sentiment.RData")
		save(reviews_final, file = reviews_out_path)
		# ------------------------------------------------
		# Bundle the key results into one named list
		# ------------------------------------------------
		# Each element is named after the variable it is taken from.
		summary_fields <- c(
		  "overall_positive_share",
		  "avg_sent_manual", "avg_sent_syuzhet", "avg_sent_nrc", "avg_sent_llm",
		  "corr_manual_vs_syuzhet", "corr_manual_vs_nrc", "corr_syuzhet_vs_nrc",
		  "corr_llm_vs_manual", "corr_llm_vs_nrc",
		  "align_votedup",
		  "extreme_help_corr", "extreme_help_bins",
		  "monthly_sentiment"
		)
		task5.summary <- mget(summary_fields, envir = environment(), inherits = TRUE)
		# ------------------------------------------------
		# Persist the summary list
		# ------------------------------------------------
		summary_out_path <- file.path(DATA_DIR, "task5_summary.RData")
		save(task5.summary, file = summary_out_path)
		# ------------------------------------------------
		# NRC emotion distribution (bar plot)
		# ------------------------------------------------
		# Sum each emotion column of nrc_mat over all rows.
		# Negative emotions are drawn in grey, the remaining ones in gold.
		emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE], na.rm = TRUE)
		has_emotion_counts <- sum(emotion_totals, na.rm = TRUE) > 0
		if (has_emotion_counts) {
		  neg_emotions <- c("anger", "disgust", "fear", "sadness")
		  # Start with the non-negative colour everywhere, then overwrite negatives.
		  emo_colors <- rep("gold", length(emotion_totals))
		  emo_colors[names(emotion_totals) %in% neg_emotions] <- "grey70"
		  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"), width = 10, height = 6)
		  barplot(
		    emotion_totals,
		    main = "NRC Emotion Counts",
		    xlab = "Emotion",
		    ylab = "Total count",
		    las = 2,
		    col = emo_colors,
		    cex.names = 0.9
		  )
		  dev.off()
		} else {
		  # Nothing to draw when every total is zero or NA.
		  message("NRC emotion plot skipped: no emotion counts available.")
		}
		# ------------------------------------------------
		# NRC emotion distribution (bar plot)
		# ------------------------------------------------
		# Sum each emotion column of nrc_mat over all rows.
		# Negative emotions are shown in grey, the remaining ones in yellow.
		emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE],
		                          na.rm = TRUE)
		# Safety check: skip plot if all zeros or NA
		if (sum(emotion_totals, na.rm = TRUE) > 0) {
		  neg_emotions <- c("anger", "disgust", "fear", "sadness")
		  # ifelse() needs both a `yes` and a `no` value; with only "grey" supplied
		  # it stops with 'argument "no" is missing' as soon as one emotion is not
		  # in neg_emotions.
		  emo_colors <- ifelse(names(emotion_totals) %in% neg_emotions,
		                       "grey", "yellow")
		  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
		      width = 10, height = 6)
		  barplot(emotion_totals,
		          main = "NRC Emotion Counts",
		          xlab = "Emotion",
		          ylab = "Total count",
		          las = 2,
		          col = emo_colors,
		          cex.names = 0.9)
		  dev.off()
		} else {
		  message("NRC emotion plot skipped: no emotion counts available.")
		}
		# Draw the NRC emotion bar chart only when there is at least one count.
		if (sum(emotion_totals, na.rm = TRUE) > 0) {
		  neg_emotions <- c("anger", "disgust", "fear", "sadness")
		  # Index 1 = "green" (not negative), index 2 = "red" (negative emotion).
		  is_negative <- names(emotion_totals) %in% neg_emotions
		  emo_colors <- c("green", "red")[is_negative + 1L]
		  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"), width = 10, height = 6)
		  barplot(
		    emotion_totals,
		    main = "NRC Emotion Counts",
		    xlab = "Emotion",
		    ylab = "Total count",
		    las = 2,
		    col = emo_colors,
		    cex.names = 0.9
		  )
		  dev.off()
		} else {
		  message("NRC emotion plot skipped: no emotion counts available.")
		}
		# Preview of the three sentiment densities on the current graphics device.
		d1 <- density(reviews_final$sent_manual, na.rm = TRUE)
		d2 <- density(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
		d3 <- density(reviews_final$sent_nrc, na.rm = TRUE)
		# Size the y-axis from the tallest curve; the default uses only the first
		# curve and clips the other two when they peak higher.
		ymax <- max(d1$y, d2$y, d3$y)
		plot(d1,
		     main = "Sentiment Density (Manual vs Syuzhet-Bing vs NRC)",
		     xlab = "Sentiment score",
		     ylab = "Density",
		     col = "green",
		     ylim = c(0, ymax * 1.1))
		lines(d2, col = "red")
		lines(d3, col = "blue")
		legend("topright",
		       legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
		       col = c("green", "red", "blue"),
		       lty = 1,
		       bty = "n")
		# Render the three sentiment densities into a single PDF.
		density_pdf <- file.path(GRAPH_DIR, "task5_sentiment_density.pdf")
		pdf(density_pdf, width = 10, height = 6)
		# Slightly wider left margin than the default.
		par(mar = c(5, 5, 4, 2) + 0.1)
		d1 <- density(reviews_final$sent_manual, na.rm = TRUE)
		d2 <- density(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
		d3 <- density(reviews_final$sent_nrc, na.rm = TRUE)
		# Tallest point across all curves, used to size the y-axis with 10% headroom.
		ymax <- max(c(d1$y, d2$y, d3$y))
		method_cols <- c("green", "red", "blue")
		plot(
		  d1,
		  main = "Sentiment Density (Manual vs Syuzhet-Bing vs NRC)",
		  xlab = "Sentiment score",
		  ylab = "Density",
		  col = method_cols[1],
		  ylim = c(0, ymax * 1.1),
		  lwd = 2
		)
		lines(d2, col = method_cols[2], lwd = 2)
		lines(d3, col = method_cols[3], lwd = 2)
		legend(
		  "topright",
		  legend = c("Manual", "Syuzhet Bing", "NRC"),
		  col = method_cols,
		  lty = 1,
		  lwd = 2,
		  bty = "n"
		)
		dev.off()
		# =============================================================================
		# 11) Graphical comparison of sentiment methods
		# Plots written to GRAPH_DIR for the report.
		# =============================================================================
		# ------------------------------------------------
		# Density plot: Manual vs Syuzhet-Bing vs NRC
		# ------------------------------------------------
		# One kernel density curve per scoring method, overlaid on shared axes.
		pdf(file = file.path(GRAPH_DIR, "task5_sentiment_density.pdf"),
		    width = 10,
		    height = 6)
		par(mar = c(5, 5, 4, 2) + 0.1) # wider left margin
		d1 <- density(reviews_final$sent_manual, na.rm = TRUE)
		d2 <- density(reviews_final$sent_syuzhet_bing, na.rm = TRUE)
		d3 <- density(reviews_final$sent_nrc, na.rm = TRUE)
		# Highest density value of any curve; the y-axis gets 10% headroom above it.
		ymax <- max(d1$y, d2$y, d3$y)
		curve_lwd <- 2
		curve_cols <- c("green", "red", "blue")
		plot(d1,
		     col = curve_cols[1],
		     lwd = curve_lwd,
		     ylim = c(0, ymax * 1.1),
		     main = "Sentiment Density (Manual vs Syuzhet-Bing vs NRC)",
		     xlab = "Sentiment score",
		     ylab = "Density")
		# Overlay the remaining two curves in order.
		overlays <- list(d2, d3)
		for (k in seq_along(overlays)) {
		  lines(overlays[[k]], col = curve_cols[k + 1L], lwd = curve_lwd)
		}
		legend("topright",
		       legend = c("Manual", "Syuzhet Bing", "NRC"),
		       col = curve_cols,
		       lty = 1,
		       lwd = curve_lwd,
		       bty = "n")
		dev.off()
		# ------------------------------------------------
		# NRC emotion distribution (bar plot)
		# ------------------------------------------------
		# Sum each emotion column of nrc_mat over all rows.
		# Negative emotions are shown in red, the remaining ones in green.
		emotion_totals <- colSums(nrc_mat[, emotion_cols, drop = FALSE],
		                          na.rm = TRUE)
		# Safety check: skip plot if all zeros or NA
		if (sum(emotion_totals, na.rm = TRUE) > 0) {
		  neg_emotions <- c("anger", "disgust", "fear", "sadness")
		  emo_colors <- ifelse(names(emotion_totals) %in% neg_emotions,
		                       "red", "green")
		  # Keep the 0-1200 axis for small corpora, but grow it (with 10% headroom)
		  # when a total exceeds 1200 so tall bars are not truncated.
		  y_top <- max(1200, max(emotion_totals, na.rm = TRUE) * 1.1)
		  pdf(file.path(GRAPH_DIR, "task5_nrc_emotions_bar.pdf"),
		      width = 10, height = 6)
		  barplot(emotion_totals,
		          main = "NRC Emotion Counts",
		          xlab = "Emotion",
		          ylab = "Total count",
		          las = 2,
		          col = emo_colors,
		          cex.names = 0.9,
		          ylim = c(0, y_top))
		  dev.off()
		} else {
		  message("NRC emotion plot skipped: no emotion counts available.")
		}
		# ------------------------------------------------
		# Monthly sentiment trend plot
		# ------------------------------------------------
		# Only produced if at least two months of data exist.
		if (!is.null(monthly_sentiment) && nrow(monthly_sentiment) >= 2) {
		  xax <- seq_len(nrow(monthly_sentiment))
		  # Span the y-axis over all three plotted series; taking it from
		  # sent_manual alone lets the other two lines leave the plot area.
		  trend_cols <- c("sent_manual", "sent_syuzhet_bing", "sent_nrc")
		  y_range <- range(unlist(monthly_sentiment[, trend_cols]), finite = TRUE)
		  pdf(file.path(GRAPH_DIR, "task5_monthly_sentiment.pdf"),
		      width = 9, height = 5)
		  plot(xax, monthly_sentiment$sent_manual,
		       type = "l",
		       main = "Average Sentiment Over Time (Monthly)",
		       xlab = "Month",
		       ylab = "Average sentiment",
		       xaxt = "n", # month labels are drawn manually below
		       ylim = y_range,
		       col = "green")
		  axis(1, at = xax,
		       labels = monthly_sentiment$month,
		       las = 2,
		       cex.axis = 0.7)
		  lines(xax, monthly_sentiment$sent_syuzhet_bing, col = "red")
		  lines(xax, monthly_sentiment$sent_nrc, col = "blue")
		  legend("topright",
		         legend = c("Manual (green)", "Syuzhet Bing (red)", "NRC (blue)"),
		         col = c("green", "red", "blue"),
		         lty = 1,
		         bty = "n")
		  dev.off()
		} else {
		  message("Monthly sentiment plot skipped: < 2 months of data.")
		}
		# =============================================================================
		# 12) Save processed data and summary outputs
		# This section stores the results needed for reproducibility and reporting.
		# =============================================================================
		# ------------------------------------------------
		# Save review-level data with sentiment scores
		# ------------------------------------------------
		# Written once: saving the same object to the same path twice in a row only
		# rewrites an identical file.
		save(reviews_final,
		     file = file.path(DATA_DIR,
		                      "reviews_final_with_sentiment.RData"))
		# ------------------------------------------------
		# Gather the headline results into one named list
		# ------------------------------------------------
		# Element names equal the names of the variables they are read from.
		task5.summary <- mget(
		  c(
		    "overall_positive_share",
		    "avg_sent_manual",
		    "avg_sent_syuzhet",
		    "avg_sent_nrc",
		    "avg_sent_llm",
		    "corr_manual_vs_syuzhet",
		    "corr_manual_vs_nrc",
		    "corr_syuzhet_vs_nrc",
		    "corr_llm_vs_manual",
		    "corr_llm_vs_nrc",
		    "align_votedup",
		    "extreme_help_corr",
		    "extreme_help_bins",
		    "monthly_sentiment"
		  ),
		  envir = environment(),
		  inherits = TRUE
		)
		# ------------------------------------------------
		# Write the summary list to disk
		# ------------------------------------------------
		summary_path <- file.path(DATA_DIR, "task5_summary.RData")
		save(task5.summary, file = summary_path)

PoDS_Ass_Team18/00_docs/.DS_Store

0 → 100644

+8 KiB

File added.

No diff preview for this file type.

View file

PoDS_Ass_Team18/00_docs/literature/Assignment_Guidelines.pdf

0 → 100644

+192 KiB

File added.

No diff preview for this file type.

View file