# Script 1 - Data cleaning (02_code/R/Script 1 - Data cleaning.R)
#
# Cleans the raw game reviews: normalises the review text, drops reviews that
# end up empty, removes words that fail the en_US spell check, lower-cases the
# text and filters the word list used for the word-frequency table.
#
# NOTE(review): only parts of this script are visible in this excerpt. Places
# where lines are missing are marked "[not shown]"; no code has been invented
# for them.

# [not shown: beginning of the script, including earlier library() calls]

library(wordcloud2)
library(SnowballC)

# Loading of the necessary dataset
# NOTE(review): absolute, machine-specific path - the script only runs on a
# machine with exactly this directory layout. Consider a project-relative path.
load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData")

# Cleaning of the user reviews--------------------------------------------------

# New dataframe for data cleaning
reviews.clean <- gamereviews

# The raw text stays in column `review`; the cleaned text goes to the new
# column `reviews`. The print() calls are spot checks of single reviews after
# each step.
# NOTE(review): the former clean.review() helper also removed English stop
# words (keeping "not"); that step is not applied in this section - confirm it
# happens elsewhere if it is still wanted.

# Remove non-ASCII characters
reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$review, perl = TRUE)
print(reviews.clean$reviews[4546])

# Replace a trailing "n't" of a word with " not" (e.g. "didn't" -> "did not")
reviews.clean$reviews <- gsub("(?<=\\w)n't", " not", reviews.clean$reviews, perl = TRUE)
print(reviews.clean$reviews[86])

# Remove URLs
reviews.clean$reviews <- gsub("http[s]?://\\S+", "", reviews.clean$reviews)
print(reviews.clean$reviews[2296])

# Collapse runs of three or more identical characters into that single
# character followed by a space
reviews.clean$reviews <- gsub("(.)\\1{2,}", "\\1 ", reviews.clean$reviews)
print(reviews.clean$reviews[76])

# Remove digits
reviews.clean$reviews <- gsub("\\d", "", reviews.clean$reviews)
print(reviews.clean$reviews[10])

# Remove punctuation
reviews.clean$reviews <- gsub("[[:punct:]]", "", reviews.clean$reviews)
print(reviews.clean$reviews[10])

# Replace newline characters with a space
reviews.clean$reviews <- gsub("\\n", " ", reviews.clean$reviews)
print(reviews.clean$reviews[2296])

# Replace multiple whitespace characters with a single space
reviews.clean$reviews <- gsub("\\s+", " ", reviews.clean$reviews)
print(reviews.clean$reviews[2296])

# Remove leading and trailing whitespace
reviews.clean$reviews <- trimws(reviews.clean$reviews)
print(reviews.clean$reviews[2296])

# Blank out reviews that consist of a single character
reviews.clean$reviews <- ifelse(nchar(reviews.clean$reviews) == 1, "", reviews.clean$reviews)

# Exclude observations whose cleaned review is empty. The is.na() guard keeps
# a missing review from turning into an all-NA row (NA != "" is NA, and
# indexing rows with NA inserts such a row).
keep.review <- !is.na(reviews.clean$reviews) & reviews.clean$reviews != ""
reviews.clean <- reviews.clean[keep.review, ]

#saving the dataset on a local computer
#setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data")

# [not shown: a few lines following the commented-out setwd()]

# Check the spelling of every review against the US English dictionary and
# keep only the correctly spelled words.
# hunspell_check() returns one logical per word (TRUE = spelled correctly), so
# it is used directly as an index. Matching the words against that logical
# vector with %in% would compare them with "TRUE"/"FALSE" and remove nothing.
# NOTE(review): two lines of the original loop are not visible in this
# excerpt; this assumes they did not change what `spell_check` holds in a way
# that alters the intended result (keep correctly spelled words) - confirm.
reviews.clean$reviews <- vapply(
  strsplit(reviews.clean$reviews, " "),
  function(words) {
    spell_check <- hunspell_check(words, dict = "en_US")
    paste(words[spell_check], collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)

# Transform all upper case characters into lower case characters
reviews.clean$reviews <- tolower(reviews.clean$reviews)

# [not shown: following lines, including a `for (i in 1:nrow(reviews.clean))`
#  loop; presumably this is where `all.words` is built - verify]

# Keep only words with more than 4 letters
all.words <- all.words[nchar(all.words) > 4]

# Counting the frequency of the words
word.freq <- data.frame(table(all.words))

# [not shown: remainder of the script]