Loading 02_code/R/Script 1 - Data cleaning.R +1 −28 Original line number Diff line number Diff line Loading @@ -23,33 +23,6 @@ load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData" # New dataframe for data cleaning reviews.clean <- gamereviews #clean.review <- function(review) { # cleaned.review <- gsub("[^[:print:]]", "", review) # remove non-ASCII character # cleaned.review <- gsub("(?<=\\w)n't", " not", review, perl = TRUE) # replacing n't with not with an empty space # cleaned.review <- gsub("http[s]?://\\S+", " ", cleaned.review) # Remove URLs # cleaned.review <- gsub("(.)\\1{2,}", "\\1 ", cleaned.review) # Replace consecutive repeated characters with a single character # cleaned.review <- gsub("\\d", " ", cleaned.review) # Remove digits # cleaned.review <- gsub("[[:punct:]]", " ", cleaned.review) # Remove punctuation # cleaned.review <- gsub("\\n", " ", cleaned.review) # Replace newline characters with a space # cleaned.review <- gsub("\\s+", " ", cleaned.review) # Replace multiple spaces with a single space # cleaned.review <- ifelse(nchar(cleaned.review) == 1, " ", cleaned.review) # Remove single-character reviews # cleaned.review <- gsub("^\\s+|\\s+$", "", cleaned.review) # Remove leading and trailing whitespaces # Remove stopwords (except "not") using tm package # stop_words <- stopwords("english") # stop_words <- setdiff(stop_words, "not") # # cleaned.review <- paste( # unlist( # lapply(strsplit(cleaned.review, " "), # function(word) ifelse(tolower(word) %in% stop_words, "", word) # ) # ), # collapse = " " # ) # # return(cleaned.review) #} reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$review, perl = TRUE) # remove non-ASCII character print(reviews.clean$reviews[4546]) Loading Loading @@ -126,7 +99,7 @@ reviews.clean <- reviews.clean[reviews.clean$reviews != "",] # Setting a working directory to save dataset setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") # Save the final dataframe after all cleaning tasks write.csv(reviews.clean, file = "reviews.clean.csv") save(reviews.clean, file = "reviews.clean.RData") # word cloud-------------------------------------------------------------------- Loading Loading
02_code/R/Script 1 - Data cleaning.R +1 −28 Original line number Diff line number Diff line Loading @@ -23,33 +23,6 @@ load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData" # New dataframe for data cleaning reviews.clean <- gamereviews #clean.review <- function(review) { # cleaned.review <- gsub("[^[:print:]]", "", review) # remove non-ASCII character # cleaned.review <- gsub("(?<=\\w)n't", " not", review, perl = TRUE) # replacing n't with not with an empty space # cleaned.review <- gsub("http[s]?://\\S+", " ", cleaned.review) # Remove URLs # cleaned.review <- gsub("(.)\\1{2,}", "\\1 ", cleaned.review) # Replace consecutive repeated characters with a single character # cleaned.review <- gsub("\\d", " ", cleaned.review) # Remove digits # cleaned.review <- gsub("[[:punct:]]", " ", cleaned.review) # Remove punctuation # cleaned.review <- gsub("\\n", " ", cleaned.review) # Replace newline characters with a space # cleaned.review <- gsub("\\s+", " ", cleaned.review) # Replace multiple spaces with a single space # cleaned.review <- ifelse(nchar(cleaned.review) == 1, " ", cleaned.review) # Remove single-character reviews # cleaned.review <- gsub("^\\s+|\\s+$", "", cleaned.review) # Remove leading and trailing whitespaces # Remove stopwords (except "not") using tm package # stop_words <- stopwords("english") # stop_words <- setdiff(stop_words, "not") # # cleaned.review <- paste( # unlist( # lapply(strsplit(cleaned.review, " "), # function(word) ifelse(tolower(word) %in% stop_words, "", word) # ) # ), # collapse = " " # ) # # return(cleaned.review) #} reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$review, perl = TRUE) # remove non-ASCII character print(reviews.clean$reviews[4546]) Loading Loading @@ -126,7 +99,7 @@ reviews.clean <- reviews.clean[reviews.clean$reviews != "",] # Setting a working directory to save dataset setwd("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data") # Save the final dataframe after all cleaning tasks write.csv(reviews.clean, file = "reviews.clean.csv") save(reviews.clean, file = "reviews.clean.RData") # word cloud-------------------------------------------------------------------- Loading