Commit 0613e276 authored by Hetvi Ariwala's avatar Hetvi Ariwala
Browse files

Some changes have been made to the data cleaning process for user reviews.

parent e3fc8f18
Loading
Loading
Loading
Loading
+52 −90
Original line number Diff line number Diff line
@@ -14,83 +14,59 @@ library(wordcloud2)
library(SnowballC)

# Loading of the necessary dataset
load(file = "D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw")
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")


# Cleaning of the user reviews---------------------------------------------
game.rev <- read.csv("gamereviews.RData")
# Cleaning of the user reviews--------------------------------------------------

# New dataframe for data cleaning
reviews.clean <- game.rev

clean_rev <- function(review) {
  cleaned_review <- gsub("[^\\x00-\\x7F]+", "", review, perl = TRUE)
  cleaned_review <- gsub("http[s]?://\\S+", "", cleaned_review)
  cleaned_review <- gsub("(.)\\1{2,}", "\\1 ", cleaned_review)
  cleaned_review <- gsub("n't", "not ", cleaned_review)
  cleaned_review <- gsub("\\d", " ", cleaned_review)
  cleaned_review <- gsub("[[:punct:]]", "", cleaned_review)
  cleaned_review <- gsub("\\n", " ", cleaned_review)
  cleaned_review <- gsub("^\\s+", " ", cleaned_review)
  cleaned_review <- gsub("\\s+", " ", cleaned_review)
  cleaned_review <- ifelse(nchar(cleaned_review) == 1, "", cleaned_review)
  
  return(cleaned_review)
reviews.clean <- gamereviews

clean.review <- function(review) {
  # Clean one raw user review string: strip non-ASCII characters, URLs,
  # digits, punctuation and long character repeats, expand negative
  # contractions, normalise whitespace, and drop English stopwords while
  # keeping "not" (negation matters for later sentiment work).
  #
  # review: a character scalar holding one raw review.
  # Returns: the cleaned review as a character scalar ("" if nothing useful remains).

  cleaned.review <- gsub("[^\\x00-\\x7F]+", "", review, perl = TRUE) # remove non-ASCII characters

  # Expand negative contractions BEFORE punctuation removal (apostrophe still
  # present). Irregular forms first; the generic rule inserts a space so
  # "doesn't" becomes "does not" instead of "doesnot".
  cleaned.review <- gsub("won't", "will not", cleaned.review)
  cleaned.review <- gsub("can't", "cannot", cleaned.review)
  cleaned.review <- gsub("n't", " not", cleaned.review)

  cleaned.review <- gsub("http[s]?://\\S+", "", cleaned.review) # remove URLs
  cleaned.review <- gsub("(.)\\1{2,}", "\\1 ", cleaned.review)  # collapse 3+ repeated characters to one
  cleaned.review <- gsub("\\d", " ", cleaned.review)            # remove digits
  cleaned.review <- gsub("[[:punct:]]", "", cleaned.review)     # remove punctuation
  cleaned.review <- gsub("\\n", " ", cleaned.review)            # newlines -> spaces
  cleaned.review <- gsub("\\s+", " ", cleaned.review)           # squeeze runs of whitespace
  cleaned.review <- trimws(cleaned.review)                      # trim leading/trailing whitespace

  # A review reduced to a single character (or nothing) carries no information.
  if (nchar(cleaned.review) <= 1) {
    return("")
  }

  # Remove English stopwords (tm package) except "not".
  stop_words <- setdiff(stopwords("english"), "not")
  words <- unlist(strsplit(cleaned.review, " ", fixed = TRUE))
  # Filter the stopwords out instead of blanking them, so no stray double
  # spaces are left behind in the reassembled string.
  words <- words[!(tolower(words) %in% stop_words)]
  paste(words, collapse = " ")
}

reviews.clean$reviews <- sapply(reviews.clean$review, clean_rev)


#reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$reviews, perl = TRUE) # remove non-ASCII character
#reviews.clean$reviews <- gsub("http[s]?://\\S+", "", reviews.clean$reviews) # remove external URL
#reviews.clean$reviews <- gsub("(.)\\1{2,}", "\\1 ", reviews.clean$reviews) # remove repeating character
#reviews.clean$reviews <- gsub("n't", "not ", reviews.clean$reviews)  # replacing n't with not with an empty space
#reviews.clean$reviews <- gsub("\\d", " ", reviews.clean$reviews) # remove numerical character
#reviews.clean$reviews <- gsub("[[:punct:]]", "", reviews.clean$reviews) # remove punctuation
#reviews.clean$reviews <- gsub("\\n", " ", reviews.clean$reviews) # replace new lines with empty space
#reviews.clean$reviews <- gsub("^\\s+", " ", reviews.clean$reviews)  # remove unnecessary empty space at the beginning
#reviews.clean$reviews <- gsub("\\s+", " ", reviews.clean$reviews) # remove unncessary empty space at the end
#reviews.clean$reviews <- ifelse(nchar(reviews.clean$reviews) == 1, "", reviews.clean$reviews)

reviews.clean$review <- reviews.clean$reviews # replacing clean reviews column with raw review column as examination has been done

reviews.clean <- reviews.clean[, -ncol(reviews.clean)] # remove clean review column as it has been replaced
# Applying function on each review which gives cleaned review
reviews.clean$review <- sapply(reviews.clean$review, clean.review)

# Exclude observations with empty spaces
reviews.clean <- reviews.clean[reviews.clean$review != "",]

#saving the dataset on a local computer
setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data")
write.csv(reviews.clean, file = "reviews.clean.csv")

# another dataframe for spelling correction
reviews.checked <- reviews.clean

# checking on 1st review
#review1 <- reviews.checked$review[1]

# 5.2 separating 1st review into words
#words <- unlist(strsplit(review1, " "))

# 5.3 checking if the words are spelled correctly based on US English dictionary
#check.spell <- hunspell_check(words, dict="en_US")
#print(check.spell)
#setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data")
#write.csv(reviews.clean, file = "reviews.clean.csv")

# 5.4 another vector which has only correct words from review 1
#words_correct <- words[!words %in% check.spell]
#print(words_correct)

# 5.5 transforming into one string
#review1 <- paste(words_correct, collapse = " ")

# 5.6 Replacing new review with original review 1 in the dataframe
#reviews.checked$review[1] <- review1
#print(reviews.checked$review[1])

# Spell checking of all reviews---------------------------------------------

# Loop through all reviews to check spelling based on US english dictionary

for (i in 1:nrow(reviews.checked)){
  reviews <- reviews.checked$review[i]
for (i in 1:nrow(reviews.clean)){
  reviews <- reviews.clean$review[i]
  
  #extract words
  words <- unlist(strsplit(reviews, " "))
@@ -105,59 +81,45 @@ for (i in 1:nrow(reviews.checked)){
  correct_review <- paste(correct_words, collapse = " ")
  
  # update the dataframe
  reviews.checked$review[i] <- correct_review
  reviews.clean$review[i] <- correct_review
}

#reviews.checked$review <- reviews.clean$reviews # replacing clean reviews column with raw review column as examination has been done

#reviews.clean <- reviews.clean[, -ncol(reviews.clean)] # remove clean review column as it has been replaced
# turn all upper case characters into lower case character
reviews.checked$review <- tolower(reviews.checked$review)
# Transform all upper-case characters into lower case
reviews.clean$review <- tolower(reviews.clean$review)

# delete reviews with ""
reviews.final <- reviews.checked[reviews.checked$review != "",]
write.csv(reviews.final, file = "reviews.final.csv")
# Setting a working directory to save dataset
setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data")
# Save the final dataframe after all cleaning tasks
write.csv(reviews.clean, file = "reviews.clean.csv")


# word cloud--------------------------------------------------------------------
# combine all reviews into a single text
all_reviews <- paste(reviews.final$review, collapse = "")

# a vector of all words
all.words <- unlist(strsplit(all_reviews,""))
all.reviews <- paste(reviews.clean$review, collapse = "")


all.words <- character()
# loop through all reviews
for (i in 1:nrow(reviews.final)) {
for (i in 1:nrow(reviews.clean)) {
  #split the review into words
  tmp <- unlist(strsplit(reviews.final$review[i], " "))
  tmp <- unlist(strsplit(reviews.clean$review[i], " "))
  # Concatenate the words to the all.words vector
  all.words <- c(all.words, tmp)
}

# Deleting words with less than 3 letters
all.words <- all.words[nchar(all.words)>=4]
all.words <- all.words[nchar(all.words)>=3]

# Counting the frequency of the words
word.freq <- data.frame(table(all.words))

# creating word count
word.cloud1 <- wordcloud2(data = word.freq, size=1.2,
                          color = "random-light", backgroundColor = "black")

# remove the filling words
all.words.wof <- all.words[!(all.words %in% stopwords("english")) | all.words == "not"]

# dataframe after removing filling words
words.freq <- data.frame(table(all.words.wof))
# word cloud 2
wordcloud2(data = words.freq, size=1.2,
wordcloud2(data = word.freq, size=1.2,
                          color = "random-light", backgroundColor = "black")

# Obtaining word stems
word.stems <- wordStem(all.words.wof)
#word.stems <- wordStem(all.words.wof)

# Creating a word cloud using word stems
wordcloud2(table(word.stems), size = 1.2, color = "random-light", 
           backgroundColor = "black")
#wordcloud2(table(word.stems), size = 1.2, color = "random-light", 
#           backgroundColor = "black")