Loading 02_code/R/Script 1 - Data cleaning.R +10 −19 Original line number Diff line number Diff line Loading @@ -32,28 +32,28 @@ load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData" reviews.clean <- gamereviews reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$review, perl = TRUE) # remove non-ASCII character print(reviews.clean$reviews[4546]) print(reviews.clean$reviews[4546]) # Updated review reviews.clean$reviews <- gsub("\\b(\\w+)n't", "\\1 not", reviews.clean$reviews) #replacing n't with not with an empty space print(reviews.clean$reviews[86]) print(reviews.clean$reviews[86]) # Updated review reviews.clean$reviews <- gsub("http[s]?://\\S+", "", reviews.clean$reviews) # Remove URLs print(reviews.clean$reviews[2296]) print(reviews.clean$reviews[2296]) # Updated review reviews.clean$reviews <- gsub("(.)\\1{2,}", "\\1 ", reviews.clean$reviews) # Replace consecutive repeated characters with a single character print(reviews.clean$reviews[76]) print(reviews.clean$reviews[76]) # Updated review reviews.clean$reviews <- gsub("\\d", "", reviews.clean$reviews) # Remove digits print(reviews.clean$reviews[10]) print(reviews.clean$reviews[10]) # Updated review reviews.clean$reviews <- gsub("[[:punct:]]", "", reviews.clean$reviews) # Remove punctuation print(reviews.clean$reviews[10]) print(reviews.clean$reviews[10]) # Updated review reviews.clean$reviews <- gsub("\\n", " ", reviews.clean$reviews) # Replace newline characters with a space print(reviews.clean$reviews[2296]) print(reviews.clean$reviews[2296]) # Updated review reviews.clean$reviews <- gsub("\\s+", " ", reviews.clean$reviews) # Replace multiple spaces with a single space print(reviews.clean$reviews[2296]) print(reviews.clean$reviews[2296]) # Updated review reviews.clean$reviews <- ifelse(nchar(reviews.clean$reviews) == 1, "", reviews.clean$reviews) # Remove single-character reviews Loading @@ -70,13 +70,11 @@ reviews.clean$reviews <- sapply(reviews.clean$reviews, function(review) { cleaned_review <- paste(filtered_words, collapse = " ") return(cleaned_review) }) print(reviews.clean$review[69]) print(reviews.clean$reviews[69]) print(reviews.clean$review[69]) # Original review print(reviews.clean$reviews[69]) # Updated review reviews.clean$reviews <- trimws(reviews.clean$reviews) # Remove leading and trailing whitespaces # Spell checking of all reviews--------------------------------------------- # Loop through all reviews to check spelling based on US english dictionary Loading Loading @@ -133,10 +131,3 @@ word.freq <- data.frame(table(all.words)) # creating word count wordcloud2(data = word.freq, size=1.2, color = "random-light", backgroundColor = "black") # Obaining word stems #word.stems <- wordStem(all.words.wof) # Creating a word cloud using word stems #wordcloud2(table(word.stems), size = 1.2, color = "random-light", # backgroundColor = "black") Loading
02_code/R/Script 1 - Data cleaning.R +10 −19 Original line number Diff line number Diff line Loading @@ -32,28 +32,28 @@ load("D:/Hohenheim/SEM 3/ADS/introads_ass2_team18/01_data/raw/gamereviews.RData" reviews.clean <- gamereviews reviews.clean$reviews <- gsub("[^\\x00-\\x7F]+", "", reviews.clean$review, perl = TRUE) # remove non-ASCII character print(reviews.clean$reviews[4546]) print(reviews.clean$reviews[4546]) # Updated review reviews.clean$reviews <- gsub("\\b(\\w+)n't", "\\1 not", reviews.clean$reviews) #replacing n't with not with an empty space print(reviews.clean$reviews[86]) print(reviews.clean$reviews[86]) # Updated review reviews.clean$reviews <- gsub("http[s]?://\\S+", "", reviews.clean$reviews) # Remove URLs print(reviews.clean$reviews[2296]) print(reviews.clean$reviews[2296]) # Updated review reviews.clean$reviews <- gsub("(.)\\1{2,}", "\\1 ", reviews.clean$reviews) # Replace consecutive repeated characters with a single character print(reviews.clean$reviews[76]) print(reviews.clean$reviews[76]) # Updated review reviews.clean$reviews <- gsub("\\d", "", reviews.clean$reviews) # Remove digits print(reviews.clean$reviews[10]) print(reviews.clean$reviews[10]) # Updated review reviews.clean$reviews <- gsub("[[:punct:]]", "", reviews.clean$reviews) # Remove punctuation print(reviews.clean$reviews[10]) print(reviews.clean$reviews[10]) # Updated review reviews.clean$reviews <- gsub("\\n", " ", reviews.clean$reviews) # Replace newline characters with a space print(reviews.clean$reviews[2296]) print(reviews.clean$reviews[2296]) # Updated review reviews.clean$reviews <- gsub("\\s+", " ", reviews.clean$reviews) # Replace multiple spaces with a single space print(reviews.clean$reviews[2296]) print(reviews.clean$reviews[2296]) # Updated review reviews.clean$reviews <- ifelse(nchar(reviews.clean$reviews) == 1, "", reviews.clean$reviews) # Remove single-character reviews Loading @@ -70,13 +70,11 @@ reviews.clean$reviews <- sapply(reviews.clean$reviews, function(review) { cleaned_review <- paste(filtered_words, collapse = " ") return(cleaned_review) }) print(reviews.clean$review[69]) print(reviews.clean$reviews[69]) print(reviews.clean$review[69]) # Original review print(reviews.clean$reviews[69]) # Updated review reviews.clean$reviews <- trimws(reviews.clean$reviews) # Remove leading and trailing whitespaces # Spell checking of all reviews--------------------------------------------- # Loop through all reviews to check spelling based on US english dictionary Loading Loading @@ -133,10 +131,3 @@ word.freq <- data.frame(table(all.words)) # creating word count wordcloud2(data = word.freq, size=1.2, color = "random-light", backgroundColor = "black") # Obaining word stems #word.stems <- wordStem(all.words.wof) # Creating a word cloud using word stems #wordcloud2(table(word.stems), size = 1.2, color = "random-light", # backgroundColor = "black")