# Commit eec5ccba — author: alexung3r — "created the docker file" (parent 862aeedd)
# File: 02_code/R/api/docker.R (new file, mode 100644, +148 −0)
# 3b 

# NOTE(review): `rm(list = ls())` wipes the caller's global environment — a
# side effect scripts should avoid (run in a fresh R session instead).
# Left in place here to preserve the script's current behavior.
rm(list=ls())
#function 2c

#works but I have to run the unnest dataframes functions before
#2c
#' Download all job postings from the Arbeitsagentur job-search API.
#'
#' Fetches an OAuth token, queries the job search endpoint page by page,
#' binds all pages into one data frame, and recursively flattens any
#' nested data-frame columns returned by the API.
#'
#' @param searchterm Query string (API parameter "was").
#' @param location   Location string (API parameter "wo").
#' @param radius     Search radius in km (API parameter "umkreis").
#' @param page_size  Results per page; API maximum is 200 (default).
#' @return A flat data.frame with one row per job posting; an empty
#'   data.frame when the query matches nothing.
download_all_jobs <- function(searchterm, location, radius, page_size = 200) {
  library(httr)
  library(dplyr)
  library(jsonlite)

  # Authentication setup (constant for all requests).
  # SECURITY NOTE(review): client credentials are hard-coded in source.
  # Move them to environment variables (Sys.getenv) before publishing.
  headers <- c("Content-Type" = 'application/x-www-form-urlencoded')
  auth_data <- list(
    client_id = 'c003a37f-024f-462a-b36d-b001be4cd24a',
    client_secret = '32a39620-32b3-4307-9aa1-511e3d7f48a8',
    grant_type = 'client_credentials'
  )

  # Fetch access token; fail loudly instead of sending unauthenticated calls.
  res <- POST(url = 'https://rest.arbeitsagentur.de/oauth/gettoken_cc',
              add_headers(.headers = headers),
              body = auth_data,
              encode = 'form')
  token <- content(res)$access_token
  if (is.null(token)) {
    stop("Failed to obtain OAuth access token from the Arbeitsagentur API.",
         call. = FALSE)
  }

  # API request setup.
  base_url <- "https://rest.arbeitsagentur.de/jobboerse/jobsuche-service/pc/v4/jobs"
  params <- list(
    "was" = searchterm,
    "wo" = location,
    size = page_size,
    "umkreis" = radius
  )

  # Initial call only to learn the total number of results.
  initial_req <- GET(url = base_url,
                     add_headers(OAuthAccessToken = token,
                                 Accept = "application/json"),
                     query = params)
  initial_data <- jsonlite::fromJSON(rawToChar(initial_req$content))
  total_offers <- initial_data$maxErgebnisse
  if (is.null(total_offers) || total_offers == 0) {
    return(data.frame())  # nothing matched; avoid the 1:0 loop bug
  }
  total_pages <- ceiling(total_offers / page_size)

  # Fetch every page (seq_len is safe even if total_pages were 0).
  all_pages_data <- vector("list", total_pages)
  for (page in seq_len(total_pages)) {
    params$page <- page
    response <- GET(url = base_url,
                    add_headers(OAuthAccessToken = token,
                                Accept = "application/json"),
                    query = params)
    page_data <- jsonlite::fromJSON(rawToChar(response$content))
    all_pages_data[[page]] <- as.data.frame(page_data$stellenangebote)
  }

  # Combine all pages into one data frame.
  job_postings <- bind_rows(all_pages_data)

  # Recursively flatten nested data-frame columns into plain columns.
  unnest_dataframes <- function(x) {
    y <- do.call(data.frame, x)
    if (any(vapply(y, is.data.frame, logical(1)))) {
      return(unnest_dataframes(y))
    }
    y
  }

  unnest_dataframes(job_postings)
}

# Example usage: fetch all "Daten" postings within 100 km of Stuttgart.
# NOTE(review): this performs live network requests whenever the file is sourced.
result_df <- download_all_jobs("Daten", "Stuttgart", 100)
#######################################################################################################
####

library(dplyr)
library(lubridate)
library(stringr)

#' Reconstruct the set of currently active job refnr IDs from saved snapshots.
#'
#' Finds the most recent "*_full.csv" snapshot in `subfolder` (files dated
#' today are ignored) and replays every later "*_new.csv" (additions) and
#' "*_old.csv" (removals) update on top of it, in date order.
#'
#' @param subfolder Directory containing the dated CSV files.
#' @return Vector of refnr IDs assumed to still be active.
#' Stops with an error when no full snapshot exists.
get_active_jobs <- function(subfolder) {
  # All CSVs except today's (today's download is not yet part of history).
  all_files <- list.files(subfolder, full.names = TRUE, pattern = "\\.csv$")
  today <- Sys.Date()
  relevant_files <- all_files[!grepl(as.character(today), all_files, fixed = TRUE)]

  # Extract the YYYY-MM-DD date embedded in each filename (NA when absent).
  # vapply-free helper: keeps Date class, unlike sapply(..., as.Date).
  file_date <- function(paths) {
    m <- regexpr("\\d{4}-\\d{2}-\\d{2}", paths)
    out <- rep(NA_character_, length(paths))
    out[m != -1] <- regmatches(paths, m)
    as.Date(out)
  }

  # Latest full snapshot (fixed = TRUE: "." would otherwise match any char).
  full_files <- relevant_files[grepl("_full.csv", relevant_files, fixed = TRUE)]
  if (length(full_files) == 0) {
    stop("No full dataset available.")
  }
  full_dates <- file_date(full_files)
  latest_full_file <- full_files[which.max(full_dates)]
  latest_full_date <- max(full_dates, na.rm = TRUE)

  # Load the most recent full dataset.
  current_jobs <- read.csv(latest_full_file)

  # Incremental files newer than the snapshot, replayed in chronological order.
  dates <- file_date(relevant_files)
  is_update <- (grepl("_new.csv", relevant_files, fixed = TRUE) |
                  grepl("_old.csv", relevant_files, fixed = TRUE)) &
    !is.na(dates) & dates > latest_full_date
  updates <- relevant_files[is_update]
  updates <- updates[order(file_date(updates))]

  for (file in updates) {
    update_data <- read.csv(file)
    if (grepl("_new.csv", file, fixed = TRUE)) {
      # _new files are written with the same schema as the full snapshot.
      current_jobs <- rbind(current_jobs, update_data[names(current_jobs)])
    } else {
      # _old files list the refnr IDs of postings that were taken down.
      current_jobs <- current_jobs[!current_jobs$refnr %in% update_data$refnr,
                                   , drop = FALSE]
    }
  }

  current_jobs$refnr  # only the active job IDs
}

# Example usage
# NOTE(review): hard-coded absolute user path — make this configurable
# (e.g. via Sys.getenv or a project-relative path) before sharing.
active_ids <- get_active_jobs("/Users/alexanderunger/Desktop/AIDAHO_IDS_THAS/01_data/raw")
# This active_ids can now be used as input for conditional_save_downloads

#######################################################################################################


#' Save today's job download as a full snapshot or as incremental diffs.
#'
#' On Mondays, or whenever `subfolder` is empty, writes the complete `jobs`
#' data frame to "<date>_jobs_<searchterm>_full.csv". On other days, given
#' the active IDs:
#'   * postings whose refnr is not yet active go to "_new.csv";
#'   * active refnr IDs that no longer appear in `jobs` go to "_old.csv"
#'     (postings that have been taken down).
#'
#' @param subfolder         Output directory; created if missing.
#' @param jobs              data.frame of today's postings (needs a refnr column).
#' @param refid_active_jobs Vector of currently active refnr IDs, or NULL to
#'                          skip the incremental branch.
#' @param searchterm        Used in the output filename.
#' @return A status message string.
conditional_save_downloads <- function(subfolder, jobs, refid_active_jobs = NULL, searchterm) {
  if (!dir.exists(subfolder)) {
    dir.create(subfolder, recursive = TRUE)
  }
  today <- Sys.Date()
  is_monday <- format(today, "%u") == "1"  # ISO weekday: Monday == "1"
  base_filename <- paste0(subfolder, "/", today, "_jobs_", searchterm)

  if (is_monday || length(list.files(subfolder)) == 0) {
    write.csv(jobs, paste0(base_filename, "_full.csv"), row.names = FALSE)
  } else if (!is.null(refid_active_jobs)) {
    # Postings we have not seen before.
    new_jobs <- jobs[!jobs$refnr %in% refid_active_jobs, , drop = FALSE]
    # BUGFIX: the previous version saved the still-present active postings
    # as "_old", which would make get_active_jobs() wrongly drop jobs that
    # are still online. "_old" must contain the active IDs that are NO
    # LONGER present in today's download.
    old_jobs <- data.frame(refnr = setdiff(refid_active_jobs, jobs$refnr))

    message("New jobs count: ", nrow(new_jobs))  # debugging log
    message("Old jobs count: ", nrow(old_jobs))  # debugging log

    if (nrow(new_jobs) > 0) {
      write.csv(new_jobs, paste0(base_filename, "_new.csv"), row.names = FALSE)
    }
    if (nrow(old_jobs) > 0) {
      write.csv(old_jobs, paste0(base_filename, "_old.csv"), row.names = FALSE)
    }
  }
  return("Files saved as required based on the conditions.")
}



# Example usage: persist today's download, diffed against the active IDs.
# NOTE(review): hard-coded absolute user path — make configurable before sharing.
conditional_save_downloads("/Users/alexanderunger/Desktop/AIDAHO_IDS_THAS/01_data/raw", result_df, active_ids, "daten")