Loading 02_code/R/Script 1 - Data cleaning.R +32 −27 Original line number Diff line number Diff line Loading @@ -5,13 +5,14 @@ install.packages("hunspell") install.packages("NLP") install.packages("tm") install.packages("wordcloud2") install.packages("SnowballC") install.packages("stringr") library(hunspell) library(NLP) library(tm) library(wordcloud2) library(SnowballC) library(stringr) #loading of ncessary dataset load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") Loading Loading @@ -52,7 +53,7 @@ reviews.clean <- gamereviews reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$review, perl = TRUE) # remove non-ASCII character print(reviews.clean$reviews[4546]) reviews.clean$reviews <- gsub("(?<=\\w)n't", " not", reviews.clean$reviews, perl = TRUE) #replacing n't with not with an empty space reviews.clean$reviews <- gsub("\\b(\\w+)n't", "\\1 not", reviews.clean$reviews) #replacing n't with not with an empty space print(reviews.clean$reviews[86]) reviews.clean$reviews <- gsub("http[s]?://\\S+", "", reviews.clean$reviews) # Remove URLs Loading @@ -73,30 +74,40 @@ print(reviews.clean$reviews[2296]) reviews.clean$reviews <- gsub("\\s+", " ", reviews.clean$reviews) # Replace multiple spaces with a single space print(reviews.clean$reviews[2296]) reviews.clean$reviews <- trimws(reviews.clean$reviews) # Remove leading and trailing whitespaces print(reviews.clean$reviews[2296]) reviews.clean$reviews <- ifelse(nchar(reviews.clean$reviews) == 1, "", reviews.clean$reviews) # Remove single-character reviews # Exclude observations with empty spaces reviews.clean <- reviews.clean[reviews.clean$reviews != "",] # Transform all upper case chracters in a lower case character reviews.clean$reviews <- tolower(reviews.clean$reviews) # Remove stopwords (except "not") using tm package stop_words <- stopwords("english") stop_words <- setdiff(stop_words, "not") reviews.clean$reviews <- sapply(reviews.clean$reviews, function(review) { words <- str_split(review, "\\s+")[[1]] filtered_words <- words[!(words %in% stop_words)] cleaned_review <- paste(filtered_words, collapse = " ") return(cleaned_review) }) print(reviews.clean$review[69]) print(reviews.clean$reviews[69]) reviews.clean$reviews <- trimws(reviews.clean$reviews) # Remove leading and trailing whitespaces #saving the dataset on a local computer #setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") #write.csv(reviews.clean, file = "reviews.clean.csv") # Spell checking of all reviews--------------------------------------------- # Loop through all reviews to check spelling based on US english dictionary # Create an empty list to store the corrected reviews corrected_reviews <- vector("list", nrow(reviews.clean)) # Create an empty list to store corrected reviews corrected_reviews <- vector("list", length = nrow(reviews.clean)) # Spell check and update reviews for (i in 1:nrow(reviews.clean)) { review <- reviews.clean$reviews[i] #extract words # Extract words words <- unlist(strsplit(review, " ")) # Spell check Loading @@ -105,18 +116,12 @@ for (i in 1:nrow(reviews.clean)){ # Consider only correct words correct_words <- words[!words %in% spell_check] # Replacing correctly spelled review with original review corrected_review <- paste(correct_words, collapse = " ") # Store the corrected review in the list corrected_reviews[[i]] <- corrected_review # Update the original review with correctly spelled words reviews.clean$reviews[i] <- paste(correct_words, collapse = " ") } # Update the dataframe with the corrected reviews reviews.clean$reviews <- unlist(corrected_reviews) # Transform all upper case chracters in a lower case character reviews.clean$reviews <- tolower(reviews.clean$reviews) # Exclude observations with empty spaces reviews.clean <- reviews.clean[reviews.clean$reviews != "",] # Setting a working directory to save dataset setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") Loading @@ -139,7 +144,7 @@ for (i in 1:nrow(reviews.clean)) { } # Deleting words with less than 3 letters all.words <- all.words[nchar(all.words)>4] all.words <- all.words[nchar(all.words)>=4] # Counting the frequency of the words word.freq <- data.frame(table(all.words)) Loading Loading
02_code/R/Script 1 - Data cleaning.R +32 −27 Original line number Diff line number Diff line Loading @@ -5,13 +5,14 @@ install.packages("hunspell") install.packages("NLP") install.packages("tm") install.packages("wordcloud2") install.packages("SnowballC") install.packages("stringr") library(hunspell) library(NLP) library(tm) library(wordcloud2) library(SnowballC) library(stringr) #loading of ncessary dataset load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData") Loading Loading @@ -52,7 +53,7 @@ reviews.clean <- gamereviews reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$review, perl = TRUE) # remove non-ASCII character print(reviews.clean$reviews[4546]) reviews.clean$reviews <- gsub("(?<=\\w)n't", " not", reviews.clean$reviews, perl = TRUE) #replacing n't with not with an empty space reviews.clean$reviews <- gsub("\\b(\\w+)n't", "\\1 not", reviews.clean$reviews) #replacing n't with not with an empty space print(reviews.clean$reviews[86]) reviews.clean$reviews <- gsub("http[s]?://\\S+", "", reviews.clean$reviews) # Remove URLs Loading @@ -73,30 +74,40 @@ print(reviews.clean$reviews[2296]) reviews.clean$reviews <- gsub("\\s+", " ", reviews.clean$reviews) # Replace multiple spaces with a single space print(reviews.clean$reviews[2296]) reviews.clean$reviews <- trimws(reviews.clean$reviews) # Remove leading and trailing whitespaces print(reviews.clean$reviews[2296]) reviews.clean$reviews <- ifelse(nchar(reviews.clean$reviews) == 1, "", reviews.clean$reviews) # Remove single-character reviews # Exclude observations with empty spaces reviews.clean <- reviews.clean[reviews.clean$reviews != "",] # Transform all upper case chracters in a lower case character reviews.clean$reviews <- tolower(reviews.clean$reviews) # Remove stopwords (except "not") using tm package stop_words <- stopwords("english") stop_words <- setdiff(stop_words, "not") reviews.clean$reviews <- sapply(reviews.clean$reviews, function(review) { words <- str_split(review, "\\s+")[[1]] filtered_words <- words[!(words %in% stop_words)] cleaned_review <- paste(filtered_words, collapse = " ") return(cleaned_review) }) print(reviews.clean$review[69]) print(reviews.clean$reviews[69]) reviews.clean$reviews <- trimws(reviews.clean$reviews) # Remove leading and trailing whitespaces #saving the dataset on a local computer #setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") #write.csv(reviews.clean, file = "reviews.clean.csv") # Spell checking of all reviews--------------------------------------------- # Loop through all reviews to check spelling based on US english dictionary # Create an empty list to store the corrected reviews corrected_reviews <- vector("list", nrow(reviews.clean)) # Create an empty list to store corrected reviews corrected_reviews <- vector("list", length = nrow(reviews.clean)) # Spell check and update reviews for (i in 1:nrow(reviews.clean)) { review <- reviews.clean$reviews[i] #extract words # Extract words words <- unlist(strsplit(review, " ")) # Spell check Loading @@ -105,18 +116,12 @@ for (i in 1:nrow(reviews.clean)){ # Consider only correct words correct_words <- words[!words %in% spell_check] # Replacing correctly spelled review with original review corrected_review <- paste(correct_words, collapse = " ") # Store the corrected review in the list corrected_reviews[[i]] <- corrected_review # Update the original review with correctly spelled words reviews.clean$reviews[i] <- paste(correct_words, collapse = " ") } # Update the dataframe with the corrected reviews reviews.clean$reviews <- unlist(corrected_reviews) # Transform all upper case chracters in a lower case character reviews.clean$reviews <- tolower(reviews.clean$reviews) # Exclude observations with empty spaces reviews.clean <- reviews.clean[reviews.clean$reviews != "",] # Setting a working directory to save dataset setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") Loading @@ -139,7 +144,7 @@ for (i in 1:nrow(reviews.clean)) { } # Deleting words with less than 3 letters all.words <- all.words[nchar(all.words)>4] all.words <- all.words[nchar(all.words)>=4] # Counting the frequency of the words word.freq <- data.frame(table(all.words)) Loading