Upload New File (d028ce4e) · Commits · Lovepreet Kapila / IntroADS_ES3

02/R/22.11.25_Collect_Steam_Data_Sources.R

0 → 100644

+114 −0

Original line number	Diff line number	Diff line
		# clear workspace: removes every object in the global environment so the
		# script does not depend on leftovers from an earlier session
		rm(list = ls())

		# TASK 2: collect the SteamSpy "top 100 forever" list
		# load necessary packages
		library(httr)
		library(jsonlite)
		library(lubridate)

		# Step 1
		# Request the data for 100 games: GET() sends a request to the API endpoint
		# and returns the raw response, whose body is in JSON format.
		top100.request <- GET(url = "https://steamspy.com/api.php?request=top100forever")

		# Stop here with the HTTP status if the request failed; otherwise an error
		# page would only surface later as a confusing JSON parsing error.
		stop_for_status(top100.request)

		# Step 2
		# Read the response as a string. content(.., as = "text") extracts the body
		# of the response as readable text; UTF-8 ensures special characters are
		# handled correctly.
		top100.raw <- content(top100.request, as = "text", encoding = "UTF-8")

		# Step 3
		# Transform the JSON into an R object (a list with one entry per game).
		top100.list <- fromJSON(top100.raw)

		# Step 4: Convert the list into a data frame
		# The list is nested, so it cannot be turned into a data frame directly.
		# lapply() converts each game entry into a small one-row data frame.
		# stringsAsFactors = FALSE keeps text columns as character on every R version
		# (before R 4.0 they became factors, and as.numeric() on a factor returns
		# the internal level codes instead of the prices).
		top100.list1 <- lapply(top100.list, as.data.frame, stringsAsFactors = FALSE)

		# do.call("rbind", ..) stacks them row by row into one big data frame
		top100 <- do.call("rbind", top100.list1)

		# Step 5: Inspect the data
		head(top100) # first 6 rows
		summary(top100)

		# Step 6: Transform price-related variables
		# Prices are in cents and stored as text. Convert them to numeric and scale
		# to dollars. Dividing by 100 (rather than multiplying by 0.01) avoids
		# extra floating-point noise such as 19.990000000000002.
		top100$price <- as.numeric(top100$price) / 100
		top100$initialprice <- as.numeric(top100$initialprice) / 100
		top100$discount <- as.numeric(top100$discount)

		# Step 7
		# Keep the id and name of the first game in the table; the id is used for
		# the follow-up requests below.
		gameid <- top100$appid[1]
		game_name <- top100$name[1]


		# TASK 3: collect the store details of the selected game
		# API endpoint for game details (HTTPS, like the reviews endpoint used
		# further down, so the request is not sent in plain text)
		details.url <- "https://store.steampowered.com/api/appdetails/?"

		# The API expects the game ID as a query parameter. gameid is the variable;
		# wrapping it in a named list() makes it compatible with GET(), which builds
		# the query string from it.
		query.list <- list(appids = gameid)

		# Perform the GET request: fetches the raw details of the game. The response
		# body is again binary.
		details.request <- GET(url = details.url, query = query.list)

		# Stop with the HTTP status if the request failed.
		stop_for_status(details.request)

		# We need the response as a string, so convert the raw body to readable text.
		details.raw <- content(details.request, as = "text", encoding = "UTF-8")

		# Transform the JSON into an R list
		details.list <- fromJSON(details.raw)

		# details.list[[1]] selects the first (and only requested) element of the
		# list. Guard against an empty response and against a failed lookup, which
		# would otherwise leave `details` silently NULL.
		details.entry <- if (length(details.list) > 0) details.list[[1]] else NULL
		if (!isTRUE(details.entry$success)) {
		  stop("Steam returned no store details for appid ", gameid, call. = FALSE)
		}

		# The useful information is inside the $data element, which holds the
		# structured information about the game.
		details <- details.entry$data

		# Inspect a few fields of interest
		details$release_date$date
		details$genres
		details$required_age
		details$short_description

		# TASK 4 (Collect user reviews)

		# Step 1 (Build the request URL parts)
		# rev.url1 - base URL for reviews
		# rev.url2 - specifies JSON output, filters for recent reviews, limits the
		#            page to 100 reviews, and ends with the cursor parameter used
		#            for pagination
		# cursor   - "*" means start at the beginning
		rev.url1 <- "https://store.steampowered.com/appreviews/"
		rev.url2 <- "?json=1&filter=recent&num_per_page=100&cursor="
		cursor <- "*"

		# Step 2 (Perform GET request)
		# paste0() concatenates strings without spaces, i.e. builds the full URL
		rev.request <- GET(url = paste0(rev.url1, gameid, rev.url2, cursor))

		# Stop with the HTTP status if the request failed.
		stop_for_status(rev.request)

		# Step 3 (Convert binary content to text)
		rev.raw <- content(rev.request, as = "text", encoding = "UTF-8")

		# Step 4 (JSON to R list)
		rev.list <- fromJSON(rev.raw)

		# Step 5 (Extract reviews into a data frame)
		reviews <- rev.list$reviews

		# Without any reviews the type conversions below would fail with an
		# unrelated error, so stop with a clear message instead.
		if (length(reviews) == 0) {
		  stop("No reviews returned for appid ", gameid, call. = FALSE)
		}

		# Step 6 (Inspect the data)
		names(reviews) # column names
		summary(reviews) # summary statistics - author data is nested

		# Step 7 (Unnest the author data)
		# expand the nested structure into proper columns (author.*)
		reviews <- do.call("data.frame", reviews)
		names(reviews)

		# Step 8 (Fix data types)
		# Convert UNIX timestamp to readable date-time
		reviews$timestamp_created <- as_datetime(reviews$timestamp_created)
		reviews$timestamp_created

		# Ensure the up-vote count and the author's number of owned games are numeric
		reviews$votes_up <- as.numeric(reviews$votes_up)
		reviews$votes_up
		reviews$author.num_games_owned <- as.numeric(reviews$author.num_games_owned)
		reviews$author.num_games_owned

		reviews

		# Drop exact duplicate rows
		reviews <- unique(reviews)
		reviews
		No newline at end of file