Commit 0bf07c4b authored by Hetvi Ariwala's avatar Hetvi Ariwala
Browse files

Data cleaning on the user reviews has been done, and a word cloud has also been created.

parent 0a7202fb
Loading
Loading
Loading
Loading
+152 −3
Original line number Diff line number Diff line
# Setup ------------------------------------------------------------------------
# NOTE: the workspace is deliberately not cleared with rm(list = ls()); that
# would silently delete objects of whoever sources this script. Start a fresh
# R session instead if a clean environment is needed.

# Packages this script depends on (each listed once).
required_packages <- c("hunspell", "NLP", "wordcloud2", "tm", "SnowballC")

# Install only the packages that are not available yet, so that re-running the
# script does not re-download and reinstall everything each time.
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(hunspell)
library(NLP)
library(wordcloud2)
library(tm)
library(SnowballC)

# Loading of the necessary dataset ---------------------------------------------
# The raw reviews are stored as an .RData file, so they have to be restored
# with load(); read.csv() cannot parse that binary format, and load() cannot
# be pointed at a directory.
raw_file <- file.path(
  "D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data",
  "raw",
  "gamereviews.RData"
)
if (!file.exists(raw_file)) {
  stop("Raw review file not found: ", raw_file, call. = FALSE)
}

# load() returns the names of the objects it restored into the workspace.
loaded_objects <- load(file = raw_file)

# NOTE(review): assumes the reviews are stored as a data frame inside the
# .RData file; the first data frame found is used -- confirm the object name.
is_data_frame <- vapply(
  loaded_objects,
  function(object_name) is.data.frame(get(object_name)),
  logical(1)
)
if (!any(is_data_frame)) {
  stop("No data frame found in ", raw_file, call. = FALSE)
}

# Cleaning of the user reviews -------------------------------------------------
game.rev <- get(loaded_objects[is_data_frame][1])

# New data frame for data cleaning
reviews.clean <- game.rev

# Clean raw review text.
#
# Arguments:
#   review  Character vector of raw review texts (vectorised; one or many).
#
# Returns:
#   A character vector of the same length as `review` in which each text has
#   non-ASCII characters, URLs, digits and punctuation removed, runs of three
#   or more identical characters shortened, "n't" expanded to " not", and
#   whitespace collapsed and trimmed. Texts that end up as a single character
#   are replaced by "". NA inputs stay NA.
clean_rev <- function(review) {
  # remove non-ASCII characters
  cleaned_review <- gsub("[^\\x00-\\x7F]+", "", review, perl = TRUE)
  # remove external URLs
  cleaned_review <- gsub("http[s]?://\\S+", "", cleaned_review)
  # shorten runs of 3+ identical characters to one character plus a space
  cleaned_review <- gsub("(.)\\1{2,}", "\\1 ", cleaned_review)
  # expand contractions: pad "not" on both sides so the negation stays a
  # separate word ("don't" -> "do not") instead of fusing with the stem
  cleaned_review <- gsub("n't", " not ", cleaned_review)
  # replace digits with a space so neighbouring words are not merged
  cleaned_review <- gsub("\\d", " ", cleaned_review)
  # remove punctuation
  cleaned_review <- gsub("[[:punct:]]", "", cleaned_review)
  # collapse every whitespace run (including new lines) into a single space
  cleaned_review <- gsub("\\s+", " ", cleaned_review)
  # drop leading and trailing whitespace so that the length check below and
  # later word splitting do not see padding
  cleaned_review <- trimws(cleaned_review)

  # a single remaining character carries no information; which() skips NAs
  cleaned_review[which(nchar(cleaned_review) == 1L)] <- ""

  cleaned_review
}

# Clean all reviews in place. clean_rev() is built from vectorised functions,
# so it can be applied to the whole column at once; this avoids the named,
# type-unstable result of sapply() and the temporary helper column that then
# had to be dropped by position.
reviews.clean$review <- clean_rev(as.character(reviews.clean$review))


# saving the dataset on a local computer
# NOTE: the working directory is changed here and stays changed; the later
# export of the final reviews relies on it.
setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data")
write.csv(reviews.clean, file = "reviews.clean.csv")

# Spell checking of all reviews ------------------------------------------------

# Separate data frame for the spell-checked reviews
reviews.checked <- reviews.clean

# For every review keep only the words that are spelled correctly according to
# the US English dictionary. hunspell_check() returns one logical per word
# (TRUE = correct), so it is used directly as a mask; matching the words
# against that logical vector with %in% would keep every word.
reviews.checked$review <- vapply(
  as.character(reviews.checked$review),
  function(review_text) {
    # missing reviews are treated as empty and removed further below
    if (is.na(review_text)) {
      return("")
    }

    # separate the review into words, ignoring empty tokens caused by
    # leading or repeated spaces
    words <- unlist(strsplit(review_text, " ", fixed = TRUE))
    words <- words[nzchar(words)]
    if (length(words) == 0) {
      return("")
    }

    # spell check and keep only the correct words
    is_correct <- hunspell_check(words, dict = "en_US")

    # transform the remaining words back into one string
    paste(words[is_correct], collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)

# turn all upper case characters into lower case characters
reviews.checked$review <- tolower(reviews.checked$review)

# delete reviews that are empty after cleaning and spell checking
reviews.final <- reviews.checked[nzchar(reviews.checked$review), ]
write.csv(reviews.final, file = "reviews.final.csv")


# Word cloud -------------------------------------------------------------------
# Vector of all words of all reviews. strsplit() is vectorised, so the whole
# column is split at once instead of growing a vector inside a loop.
all.words <- unlist(
  strsplit(as.character(reviews.final$review), " ", fixed = TRUE)
)
all.words <- all.words[!is.na(all.words)]

# Deleting words with less than 3 letters. The threshold has to be 3 (not 4),
# otherwise the negation "not", which is explicitly kept below, would already
# have been removed here.
all.words <- all.words[nchar(all.words) >= 3]

# Counting the frequency of the words
word.freq <- data.frame(table(all.words))

# Word cloud 1: all words
word.cloud1 <- wordcloud2(data = word.freq, size = 1.2,
                          color = "random-light", backgroundColor = "black")
word.cloud1

# Remove the filler (stop) words, but keep "not" because it carries the
# negation of a statement
all.words.wof <- all.words[
  !(all.words %in% stopwords("english")) | all.words == "not"
]

# Data frame of word frequencies after removing the filler words
words.freq <- data.frame(table(all.words.wof))

# Word cloud 2: without filler words
wordcloud2(data = words.freq, size = 1.2,
           color = "random-light", backgroundColor = "black")

# Obtaining word stems
word.stems <- wordStem(all.words.wof)

# Word cloud 3: word stems (passed as a word/frequency data frame, like the
# other two clouds)
wordcloud2(data = data.frame(table(word.stems)), size = 1.2,
           color = "random-light", backgroundColor = "black")