# Script 1 - Data cleaning ----------------------------------------------------
# Cleans raw Steam game reviews, removes misspelled words via hunspell,
# and builds word clouds of frequent words / word stems.
#
# NOTE(review): paths below are hard-coded to one machine — prefer a
# project-relative path (e.g. via here::here()) — kept as-is to preserve
# the script's original output locations.

# Packages ----
# Install only what is missing instead of reinstalling on every run
# (the original also installed/attached wordcloud2 twice).
pkgs <- c("hunspell", "NLP", "wordcloud2", "tm", "SnowballC")
missing_pkgs <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(hunspell)
library(NLP)
library(wordcloud2)
library(tm)
library(SnowballC)

# Data ----
# load() restores the saved data frame (assumed to be `game.rev`) into the
# workspace; it is NOT a csv, so the original read.csv("gamereviews.RData")
# was wrong and has been removed, as has the load() of a bare directory.
load(file = "D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")

# Cleaning of the user reviews ------------------------------------------------

# Clean a single raw review string.
# Strips non-ASCII characters, URLs, runs of 3+ repeated characters, digits,
# punctuation and newlines; expands "n't" to "not "; trims and collapses
# whitespace. Reviews reduced to a single character become "".
#
# @param review Character scalar: one raw review.
# @return Character scalar: the cleaned review (possibly "").
clean_rev <- function(review) {
  cleaned <- gsub("[^\\x00-\\x7F]+", "", review, perl = TRUE) # non-ASCII
  cleaned <- gsub("http[s]?://\\S+", "", cleaned)             # external URLs
  cleaned <- gsub("(.)\\1{2,}", "\\1 ", cleaned)              # repeated chars
  cleaned <- gsub("n't", "not ", cleaned)                     # expand "n't"
  cleaned <- gsub("\\d", " ", cleaned)                        # digits
  cleaned <- gsub("[[:punct:]]", "", cleaned)                 # punctuation
  cleaned <- gsub("\\n", " ", cleaned)                        # newlines
  # Fix: actually remove leading/trailing whitespace (the original replaced
  # leading whitespace with a single space despite the "remove" comment).
  cleaned <- gsub("^\\s+|\\s+$", "", cleaned)
  cleaned <- gsub("\\s+", " ", cleaned)                       # collapse runs
  if (nchar(cleaned) <= 1) {
    cleaned <- ""
  }
  cleaned
}

# New data frame for data cleaning; vapply (not sapply) guarantees a
# character vector and replaces the temporary helper column + column-drop
# dance of the original.
reviews.clean <- game.rev
reviews.clean$review <- vapply(reviews.clean$review, clean_rev,
                               character(1), USE.NAMES = FALSE)

# Save the cleaned dataset locally.
out_dir <- "D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data"
write.csv(reviews.clean, file = file.path(out_dir, "reviews.clean.csv"))

# Spell checking of all reviews ------------------------------------------------

# Keep only the words hunspell recognizes against the US English dictionary.
#
# BUG FIX: the original did `words[!words %in% spell_check]`, comparing the
# character words against the *logical* vector hunspell_check() returns —
# the %in% test is essentially always FALSE, so the `!` kept every word.
# The correct subset of correctly-spelled words is `words[ok]`.
#
# @param review Character scalar: one cleaned review.
# @return Character scalar: the review with unrecognized words dropped.
keep_correct_words <- function(review) {
  words <- unlist(strsplit(review, " "))
  ok <- hunspell_check(words, dict = "en_US") # logical, one flag per word
  paste(words[ok], collapse = " ")
}

reviews.checked <- reviews.clean
reviews.checked$review <- vapply(reviews.checked$review, keep_correct_words,
                                 character(1), USE.NAMES = FALSE)

# Turn all upper-case characters into lower case.
reviews.checked$review <- tolower(reviews.checked$review)

# Drop reviews that became empty during cleaning / spell checking.
reviews.final <- reviews.checked[reviews.checked$review != "", ]

write.csv(reviews.final, file = file.path(out_dir, "reviews.final.csv"))

# Word cloud -------------------------------------------------------------------

# One flat vector of all words across all remaining reviews. This replaces
# the original's dead code (a character-wise strsplit that was immediately
# overwritten) and its O(n^2) grow-by-c() row loop.
all.words <- unlist(strsplit(reviews.final$review, " "), use.names = FALSE)

# Keep only words with at least 4 letters (drop words of 3 letters or fewer).
all.words <- all.words[nchar(all.words) >= 4]

# Word frequencies for the first cloud.
word.freq <- data.frame(table(all.words))
word.cloud1 <- wordcloud2(data = word.freq, size = 1.2,
                          color = "random-light", backgroundColor = "black")

# Remove filler (stop) words, but always keep "not" for negation.
all.words.wof <- all.words[!(all.words %in% stopwords("english")) |
                             all.words == "not"]

# Frequencies after removing filler words; second word cloud.
words.freq <- data.frame(table(all.words.wof))
wordcloud2(data = words.freq, size = 1.2,
           color = "random-light", backgroundColor = "black")

# Obtain word stems and build a word cloud from them.
word.stems <- wordStem(all.words.wof)
wordcloud2(table(word.stems), size = 1.2,
           color = "random-light", backgroundColor = "black")