# Job-posting archive pipeline for the Bundesagentur fuer Arbeit
# "jobsuche" API: download all matching postings, keep a dated CSV
# archive (weekly full snapshots + daily new/old deltas), and rebuild
# the set of currently active postings from that archive.
#
# NOTE(review): the original script began with rm(list = ls()); removed,
# as clearing the caller's global environment is a side effect scripts
# should never have.

library(httr)
library(dplyr)
library(jsonlite)
library(lubridate)
library(stringr)

#' Download every job posting matching a search from the jobsuche API.
#'
#' Fetches an OAuth client-credentials token, pages through the search
#' results (200 per page), and returns one flat data frame.
#'
#' @param searchterm Search keyword (API parameter "was").
#' @param location   Location name (API parameter "wo").
#' @param radius     Search radius in km (API parameter "umkreis").
#' @return A data.frame with one row per posting, nested columns flattened.
download_all_jobs <- function(searchterm, location, radius) {
  # --- Authentication: client-credentials token -------------------------
  # NOTE(review): these client credentials are hard-coded; if they are not
  # the API's published public credentials, move them to environment
  # variables before committing.
  token_res <- POST(
    url = "https://rest.arbeitsagentur.de/oauth/gettoken_cc",
    add_headers("Content-Type" = "application/x-www-form-urlencoded"),
    body = list(
      client_id     = "c003a37f-024f-462a-b36d-b001be4cd24a",
      client_secret = "32a39620-32b3-4307-9aa1-511e3d7f48a8",
      grant_type    = "client_credentials"
    ),
    encode = "form"
  )
  stop_for_status(token_res, task = "fetch OAuth token")
  token <- content(token_res)$access_token
  if (is.null(token)) {
    stop("Token endpoint returned no access_token.", call. = FALSE)
  }

  base_url <- "https://rest.arbeitsagentur.de/jobboerse/jobsuche-service/pc/v4/jobs"
  page_size <- 200
  params <- list(was = searchterm, wo = location,
                 size = page_size, umkreis = radius)

  # Fetch one results page and parse the JSON body.
  fetch_page <- function(page) {
    resp <- GET(
      url = base_url,
      add_headers(OAuthAccessToken = token, Accept = "application/json"),
      query = c(params, list(page = page))
    )
    stop_for_status(resp, task = paste("fetch results page", page))
    fromJSON(rawToChar(resp$content))
  }

  # First call just to learn the total number of matching offers.
  total_offers <- fetch_page(1)$maxErgebnisse
  if (is.null(total_offers) || total_offers == 0) {
    return(data.frame())  # no matches is an empty result, not an error
  }
  total_pages <- ceiling(total_offers / page_size)

  # seq_len() is safe if total_pages were ever 0 (1:0 would iterate twice).
  pages <- vector("list", total_pages)
  for (page in seq_len(total_pages)) {
    pages[[page]] <- as.data.frame(fetch_page(page)$stellenangebote)
  }
  job_postings <- bind_rows(pages)

  # Recursively flatten nested data.frame columns produced by fromJSON().
  unnest_dataframes <- function(x) {
    flat <- do.call(data.frame, x)
    if (any(vapply(flat, is.data.frame, logical(1)))) {
      return(unnest_dataframes(flat))
    }
    flat
  }
  unnest_dataframes(job_postings)
}

#' Reconstruct the refnr IDs of currently active jobs from the archive.
#'
#' Replays the most recent "*_full.csv" snapshot, then every later
#' "*_new.csv" (row additions) and "*_old.csv" (refnr removals) file.
#' Files dated today are excluded so a run compares against the state
#' as of the previous run.
#'
#' @param subfolder Directory containing the dated CSV archive.
#' @return Vector of active job reference numbers (refnr).
get_active_jobs <- function(subfolder) {
  all_files <- list.files(subfolder, full.names = TRUE, pattern = "\\.csv$")
  relevant_files <-
    all_files[!grepl(as.character(Sys.Date()), all_files, fixed = TRUE)]

  # fixed = TRUE: in the original pattern the unescaped "." matched any char.
  full_files <- relevant_files[grepl("_full.csv", relevant_files, fixed = TRUE)]
  if (length(full_files) == 0) {
    stop("No full dataset available.", call. = FALSE)
  }

  # as.Date() is vectorized; the original sapply() silently stripped the
  # Date class from the result.
  file_date <- function(path) as.Date(str_extract(path, "\\d{4}-\\d{2}-\\d{2}"))
  full_dates <- file_date(full_files)
  latest_full_date <- max(full_dates)
  current_jobs <- read.csv(full_files[which.max(full_dates)])

  # Delta files written after the latest full snapshot. list.files() sorts
  # alphabetically and ISO dates sort correctly as strings, so the files
  # are already in chronological order; NA dates are excluded explicitly.
  update_dates <- file_date(relevant_files)
  is_update <- (grepl("_new.csv", relevant_files, fixed = TRUE) |
                  grepl("_old.csv", relevant_files, fixed = TRUE)) &
    !is.na(update_dates) & update_dates > latest_full_date
  updates <- relevant_files[is_update]

  for (file in updates) {
    update_data <- read.csv(file)
    if (grepl("_new.csv", file, fixed = TRUE)) {
      current_jobs <- bind_rows(current_jobs, update_data)
    } else {
      current_jobs <- filter(current_jobs, !refnr %in% update_data$refnr)
    }
  }
  current_jobs$refnr  # only the active job IDs are needed downstream
}

#' Save today's download, either as a full snapshot or as daily deltas.
#'
#' On Mondays, or when the folder is empty (first run), the complete
#' download is written as "*_full.csv". Otherwise the download is diffed
#' against the active reference numbers: unknown postings go to
#' "*_new.csv" and a refnr-only file goes to "*_old.csv".
#'
#' @param subfolder Output directory; created if missing.
#' @param jobs Data frame of today's postings (must contain refnr).
#' @param refid_active_jobs Vector of active refnr IDs; NULL skips the diff.
#' @param searchterm Tag embedded in the output file names.
#' @return (Invisibly) a status string.
conditional_save_downloads <- function(subfolder, jobs,
                                       refid_active_jobs = NULL, searchterm) {
  if (!dir.exists(subfolder)) {
    dir.create(subfolder, recursive = TRUE)
  }

  today <- Sys.Date()
  # week_start = 1 makes Monday == 1 regardless of the user's
  # lubridate.week.start option (the original wday(today) == 2 only
  # means Monday under the default week_start = 7).
  is_monday <- wday(today, week_start = 1) == 1
  base_filename <- file.path(subfolder, paste0(today, "_jobs_", searchterm))

  if (is_monday || length(list.files(subfolder)) == 0) {
    write.csv(jobs, paste0(base_filename, "_full.csv"), row.names = FALSE)
  } else if (!is.null(refid_active_jobs)) {
    new_jobs <- filter(jobs, !refnr %in% refid_active_jobs)
    # NOTE(review): as written, "_old.csv" records downloaded jobs that are
    # STILL active, and expired jobs (active IDs absent from today's
    # download) are never written — so get_active_jobs() will drop live
    # postings. Verify intent; "old" should probably be
    # refid_active_jobs[!refid_active_jobs %in% jobs$refnr]. Behavior kept
    # unchanged here pending confirmation.
    old_jobs <- jobs %>%
      filter(refnr %in% refid_active_jobs) %>%
      select(refnr)

    message("New jobs count: ", nrow(new_jobs))  # status log
    message("Old jobs count: ", nrow(old_jobs))  # status log

    if (nrow(new_jobs) > 0) {
      write.csv(new_jobs, paste0(base_filename, "_new.csv"), row.names = FALSE)
    }
    if (nrow(old_jobs) > 0) {
      write.csv(old_jobs, paste0(base_filename, "_old.csv"), row.names = FALSE)
    }
  }

  invisible("Files saved as required based on the conditions.")
}

# Script entry point ----
result_df <- download_all_jobs("Daten", "Stuttgart", 100)
active_ids <- get_active_jobs(
  "/Users/alexanderunger/Desktop/AIDAHO_IDS_THAS/01_data/raw"
)
conditional_save_downloads(
  "/Users/alexanderunger/Desktop/AIDAHO_IDS_THAS/01_data/raw",
  result_df,
  active_ids,
  "daten"
)
# NOTE(review): this region contained a second, byte-identical paste of the
# entire script above (diff-viewer residue included). Defining the three
# functions twice and re-running the download/save pipeline a second time
# was redundant and doubled the API calls and file writes, so the duplicate
# has been removed.