# Commit eec5ccba — author: alexung3r — "created the docker file" (parent 862aeedd)
# File: 02_code/R/api/docker.R (new file, mode 100644, +148 −0)
# 3b 

# NOTE(review): `rm(list = ls())` wipes the caller's global environment — a
# side effect scripts should avoid (run in a fresh R session instead).
# Left in place here to preserve the script's current behavior.
rm(list=ls())
#function 2c

#works but I have to run the unnest dataframes functions before
#2c
#' Download all job postings from the Arbeitsagentur job-search API.
#'
#' Fetches an OAuth token, queries the job search endpoint page by page,
#' binds all pages into one data frame, and recursively flattens any
#' nested data-frame columns returned by the API.
#'
#' @param searchterm Query string (API parameter "was").
#' @param location   Location string (API parameter "wo").
#' @param radius     Search radius in km (API parameter "umkreis").
#' @param page_size  Results per page; API maximum is 200 (default).
#' @return A flat data.frame with one row per job posting; an empty
#'   data.frame when the query matches nothing.
download_all_jobs <- function(searchterm, location, radius, page_size = 200) {
  library(httr)
  library(dplyr)
  library(jsonlite)

  # Authentication setup (constant for all requests).
  # SECURITY NOTE(review): client credentials are hard-coded in source.
  # Move them to environment variables (Sys.getenv) before publishing.
  headers <- c("Content-Type" = 'application/x-www-form-urlencoded')
  auth_data <- list(
    client_id = 'c003a37f-024f-462a-b36d-b001be4cd24a',
    client_secret = '32a39620-32b3-4307-9aa1-511e3d7f48a8',
    grant_type = 'client_credentials'
  )

  # Fetch access token; fail loudly instead of sending unauthenticated calls.
  res <- POST(url = 'https://rest.arbeitsagentur.de/oauth/gettoken_cc',
              add_headers(.headers = headers),
              body = auth_data,
              encode = 'form')
  token <- content(res)$access_token
  if (is.null(token)) {
    stop("Failed to obtain OAuth access token from the Arbeitsagentur API.",
         call. = FALSE)
  }

  # API request setup.
  base_url <- "https://rest.arbeitsagentur.de/jobboerse/jobsuche-service/pc/v4/jobs"
  params <- list(
    "was" = searchterm,
    "wo" = location,
    size = page_size,
    "umkreis" = radius
  )

  # Initial call only to learn the total number of results.
  initial_req <- GET(url = base_url,
                     add_headers(OAuthAccessToken = token,
                                 Accept = "application/json"),
                     query = params)
  initial_data <- jsonlite::fromJSON(rawToChar(initial_req$content))
  total_offers <- initial_data$maxErgebnisse
  if (is.null(total_offers) || total_offers == 0) {
    return(data.frame())  # nothing matched; avoid the 1:0 loop bug
  }
  total_pages <- ceiling(total_offers / page_size)

  # Fetch every page (seq_len is safe even if total_pages were 0).
  all_pages_data <- vector("list", total_pages)
  for (page in seq_len(total_pages)) {
    params$page <- page
    response <- GET(url = base_url,
                    add_headers(OAuthAccessToken = token,
                                Accept = "application/json"),
                    query = params)
    page_data <- jsonlite::fromJSON(rawToChar(response$content))
    all_pages_data[[page]] <- as.data.frame(page_data$stellenangebote)
  }

  # Combine all pages into one data frame.
  job_postings <- bind_rows(all_pages_data)

  # Recursively flatten nested data-frame columns into plain columns.
  unnest_dataframes <- function(x) {
    y <- do.call(data.frame, x)
    if (any(vapply(y, is.data.frame, logical(1)))) {
      return(unnest_dataframes(y))
    }
    y
  }

  unnest_dataframes(job_postings)
}

# Example usage: fetch all "Daten" postings within 100 km of Stuttgart.
# NOTE(review): this performs live network requests whenever the file is sourced.
result_df <- download_all_jobs("Daten", "Stuttgart", 100)
#######################################################################################################
####

library(dplyr)
library(lubridate)
library(stringr)

#' Reconstruct the set of currently active job refnr IDs from saved snapshots.
#'
#' Finds the most recent "*_full.csv" snapshot in `subfolder` (files dated
#' today are ignored) and replays every later "*_new.csv" (additions) and
#' "*_old.csv" (removals) update on top of it, in date order.
#'
#' @param subfolder Directory containing the dated CSV files.
#' @return Vector of refnr IDs assumed to still be active.
#' Stops with an error when no full snapshot exists.
get_active_jobs <- function(subfolder) {
  # All CSVs except today's (today's download is not yet part of history).
  all_files <- list.files(subfolder, full.names = TRUE, pattern = "\\.csv$")
  today <- Sys.Date()
  relevant_files <- all_files[!grepl(as.character(today), all_files, fixed = TRUE)]

  # Extract the YYYY-MM-DD date embedded in each filename (NA when absent).
  # vapply-free helper: keeps Date class, unlike sapply(..., as.Date).
  file_date <- function(paths) {
    m <- regexpr("\\d{4}-\\d{2}-\\d{2}", paths)
    out <- rep(NA_character_, length(paths))
    out[m != -1] <- regmatches(paths, m)
    as.Date(out)
  }

  # Latest full snapshot (fixed = TRUE: "." would otherwise match any char).
  full_files <- relevant_files[grepl("_full.csv", relevant_files, fixed = TRUE)]
  if (length(full_files) == 0) {
    stop("No full dataset available.")
  }
  full_dates <- file_date(full_files)
  latest_full_file <- full_files[which.max(full_dates)]
  latest_full_date <- max(full_dates, na.rm = TRUE)

  # Load the most recent full dataset.
  current_jobs <- read.csv(latest_full_file)

  # Incremental files newer than the snapshot, replayed in chronological order.
  dates <- file_date(relevant_files)
  is_update <- (grepl("_new.csv", relevant_files, fixed = TRUE) |
                  grepl("_old.csv", relevant_files, fixed = TRUE)) &
    !is.na(dates) & dates > latest_full_date
  updates <- relevant_files[is_update]
  updates <- updates[order(file_date(updates))]

  for (file in updates) {
    update_data <- read.csv(file)
    if (grepl("_new.csv", file, fixed = TRUE)) {
      # _new files are written with the same schema as the full snapshot.
      current_jobs <- rbind(current_jobs, update_data[names(current_jobs)])
    } else {
      # _old files list the refnr IDs of postings that were taken down.
      current_jobs <- current_jobs[!current_jobs$refnr %in% update_data$refnr,
                                   , drop = FALSE]
    }
  }

  current_jobs$refnr  # only the active job IDs
}

# Example usage
# NOTE(review): hard-coded absolute user path — make this configurable
# (e.g. via Sys.getenv or a project-relative path) before sharing.
active_ids <- get_active_jobs("/Users/alexanderunger/Desktop/AIDAHO_IDS_THAS/01_data/raw")
# This active_ids can now be used as input for conditional_save_downloads

#######################################################################################################


#' Save today's job download as a full snapshot or as incremental diffs.
#'
#' On Mondays, or whenever `subfolder` is empty, writes the complete `jobs`
#' data frame to "<date>_jobs_<searchterm>_full.csv". On other days, given
#' the active IDs:
#'   * postings whose refnr is not yet active go to "_new.csv";
#'   * active refnr IDs that no longer appear in `jobs` go to "_old.csv"
#'     (postings that have been taken down).
#'
#' @param subfolder         Output directory; created if missing.
#' @param jobs              data.frame of today's postings (needs a refnr column).
#' @param refid_active_jobs Vector of currently active refnr IDs, or NULL to
#'                          skip the incremental branch.
#' @param searchterm        Used in the output filename.
#' @return A status message string.
conditional_save_downloads <- function(subfolder, jobs, refid_active_jobs = NULL, searchterm) {
  if (!dir.exists(subfolder)) {
    dir.create(subfolder, recursive = TRUE)
  }
  today <- Sys.Date()
  is_monday <- format(today, "%u") == "1"  # ISO weekday: Monday == "1"
  base_filename <- paste0(subfolder, "/", today, "_jobs_", searchterm)

  if (is_monday || length(list.files(subfolder)) == 0) {
    write.csv(jobs, paste0(base_filename, "_full.csv"), row.names = FALSE)
  } else if (!is.null(refid_active_jobs)) {
    # Postings we have not seen before.
    new_jobs <- jobs[!jobs$refnr %in% refid_active_jobs, , drop = FALSE]
    # BUGFIX: the previous version saved the still-present active postings
    # as "_old", which would make get_active_jobs() wrongly drop jobs that
    # are still online. "_old" must contain the active IDs that are NO
    # LONGER present in today's download.
    old_jobs <- data.frame(refnr = setdiff(refid_active_jobs, jobs$refnr))

    message("New jobs count: ", nrow(new_jobs))  # debugging log
    message("Old jobs count: ", nrow(old_jobs))  # debugging log

    if (nrow(new_jobs) > 0) {
      write.csv(new_jobs, paste0(base_filename, "_new.csv"), row.names = FALSE)
    }
    if (nrow(old_jobs) > 0) {
      write.csv(old_jobs, paste0(base_filename, "_old.csv"), row.names = FALSE)
    }
  }
  return("Files saved as required based on the conditions.")
}



# Example usage: persist today's download, diffed against the active IDs.
# NOTE(review): hard-coded absolute user path — make configurable before sharing.
conditional_save_downloads("/Users/alexanderunger/Desktop/AIDAHO_IDS_THAS/01_data/raw", result_df, active_ids, "daten")