Commit f418e7c4 authored by Markus Mößler's avatar Markus Mößler
Browse files

updated repository

parent dd966c2e
Loading
Loading
Loading
Loading
+0 −337
Original line number Diff line number Diff line
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# PREAMBLE ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## a. CLEAN WORKSPACE AND LOAD LIBRARIES ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Remove every object from the current workspace, set the stringsAsFactors
# option to FALSE and attach the RPostgreSQL package used for all queries below.
rm(list = ls())
options(stringsAsFactors = FALSE)
library(RPostgreSQL)

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# TASKS ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++




# Connection settings.
# NOTE(review): the credentials are hard-coded in the script; presumably this
# is a shared read-only course account -- confirm before reusing this pattern.
dsn_database <- "aidaho"   # Specify the name of your Database
dsn_hostname <- "193.196.53.49"  # localhost = 127.0.0.1
dsn_port <- "8001"                # Specify your port number. e.g. 98939
dsn_uid <- "student"         # Specify your username. e.g. "admin"
dsn_pwd <- "aidaho"        # Specify your password. e.g. "xxx"


# Open the connection. Every statement below needs `connect`, so a failed
# connection is raised as an error (including the driver's reason) instead of
# only being printed and then causing "object 'connect' not found" later on.
tryCatch({
    drv <- dbDriver("PostgreSQL")
    print("Connecting to Database…")
    connect <- dbConnect(drv, 
                         dbname = dsn_database,
                         host = dsn_hostname, 
                         port = dsn_port,
                         user = dsn_uid, 
                         password = dsn_pwd)
    print("Database Connected!")
},
error=function(cond) {
    stop("Unable to connect to Database. ", conditionMessage(cond),
         call. = FALSE)
}
)

# Check Connection
# dbGetQuery() sends the statement, fetches all rows and clears the result set.
dbGetQuery(connect,"SELECT version();")


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## 2. Get an overview over the database ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


# All queries in this section use dbGetQuery(), which sends the statement,
# fetches every row and clears the result set in one call.

#+ First ten rows of the trade table.
dbGetQuery(connect," SELECT * FROM iex.trade_reports LIMIT 10;")

#+ Name and data type of every column in schema iex.
dbGetQuery(connect,"SELECT column_name, data_type 
    FROM information_schema.columns 
    WHERE table_schema = 'iex';")


#+ What do the above queries return?
#+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#+ The first query returns the first 10 rows of the table iex.trade_reports.
#+ The second query returns the name and data type of every column of the
#+ tables in schema iex (there is no filter on table_name).

#+ What other tables does the information_schema contain?
#+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#+ Well let's see:
dbGetQuery(connect,"SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = 'information_schema' 
    ORDER BY table_name;")

#+ Full column metadata (including is_nullable) for schema iex:
dbGetQuery(connect,"SELECT * FROM information_schema.columns 
    WHERE table_schema = 'iex';")

#+ What information do the columns of iex.trade_reports contain?
#+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#+ ordinal - ordinal number that IDs the timestamp
#+ timestamp - the timestamp of the trade up to 6 digit precision
#+ flags - the trade flag as used by the IEX
#+ symbol - the stock ticker
#+ size - the size of the transaction (how many shares have been transacted)
#+ price - the price of the trade
#+ trade_id - id number that identifies the transaction

#+ Does a primary key exist in the table?
#+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#+ is_nullable = NO in information_schema.columns only shows that a column is
#+ NOT NULL. Primary-key columns are always NOT NULL, but a NOT NULL column
#+ need not belong to a primary key. The query below lists the primary-key
#+ columns directly; an empty result means the table has no primary key.
dbGetQuery(connect,"SELECT a.attname AS column_name 
    FROM pg_index i 
    JOIN pg_attribute a 
      ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey) 
    WHERE i.indrelid = 'iex.trade_reports'::regclass 
      AND i.indisprimary;")


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## 3. Short Queries ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Each question is answered with dbGetQuery(), which sends the statement,
# fetches all rows and clears the result set.

#+ How many distinct symbols does the table contain?
dbGetQuery(connect,"SELECT COUNT(DISTINCT symbol) from iex.trade_reports;")

#+ How many different financial instruments (symbols) have been traded
#+ BETWEEN 2022-01-24 10:00:00-05 AND 2022-01-24 11:00:00-05?
dbGetQuery(connect,"SELECT COUNT(DISTINCT symbol) 
                            FROM iex.trade_reports 
                            WHERE timestamp BETWEEN '2022-01-24 10:00:00-05' AND '2022-01-24 11:00:00-05';")

#+ How many trades of AAPL have taken place within the trading hours 10h00 and 11h00?
dbGetQuery(connect,"SELECT COUNT(DISTINCT TRADE_ID) 
                            FROM iex.trade_reports 
                            WHERE timestamp BETWEEN '2022-01-24 10:00:00 -5:00:00' AND '2022-01-24 11:00:00 -5:00:00'
                            AND symbol = 'AAPL'")

#+ Calculate the average price for each symbol in the sample.
dbGetQuery(connect,"SELECT symbol,AVG(price) 
                            FROM iex.trade_reports 
                            GROUP BY symbol;")

#+ Which symbol has the highest average price?
dbGetQuery(connect,"SELECT symbol, AVG(price) 
                            FROM iex.trade_reports 
                            GROUP BY symbol 
                            ORDER BY AVG(price) DESC LIMIT 1;")

#+ How many symbols have an average price above 1000 USD?
#+ The query lists the symbols; the number of rows answers the question.
expensive_symbols <- dbGetQuery(connect,"SELECT symbol, AVG(price) 
                            FROM iex.trade_reports 
                            GROUP BY symbol 
                            HAVING AVG(price) > 1000;")
expensive_symbols
nrow(expensive_symbols)


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## 4. Last price within a 5 minute interval ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# The final query is built in three nested steps so that each step can be
# inspected on its own. dbGetQuery() is used throughout: it sends the
# statement, fetches all rows and clears the result set.
ticker <- "MSFT"
interval <- "5"

# Step 1: tag every trade of `ticker` with the start of its bucket, i.e.
# floor(epoch / bucket_seconds) * bucket_seconds converted back to a timestamp.
innerquery <- paste0("SELECT TO_TIMESTAMP(
    FLOOR(
        EXTRACT(epoch FROM timestamp) / 
            EXTRACT(epoch FROM INTERVAL '",interval," min')
        ) * EXTRACT(epoch FROM INTERVAL '",interval," min')
    ) as time_interval,
    * 
        FROM iex.trade_reports 
    WHERE symbol = '",ticker,"'
    ORDER BY timestamp")

# test innerquery
dbGetQuery(connect,paste0(innerquery," LIMIT 10"))

# Step 2: number the trades inside each bucket from newest to oldest, so that
# rownumber = 1 is the last trade of the bucket.
mezzaninequery <- paste0("SELECT ",
                         "row_number() OVER (PARTITION BY time_interval ORDER BY timestamp DESC) as rownumber, ",
                         "* ",
                         "FROM ",
                         "(",innerquery,") as iq")
# test mezzaninequery
dbGetQuery(connect,paste0(mezzaninequery," LIMIT 10"))

# Step 3: keep only the last trade of every bucket.
outerquery <- paste0("SELECT * ",
                     "FROM ",
                     "(",mezzaninequery,") as mq ",
                     "WHERE rownumber=1 ",
                     "ORDER BY time_interval")
# test outerquery (no limit)
dbGetQuery(connect,outerquery)


# Construct a function
# Build the SQL text that returns, for one symbol, the last trade of every
# `interval`-minute bucket in iex.trade_reports.
#
# Arguments:
#   interval  bucket width in minutes; a positive number or numeric string.
#   ticker    symbol to filter on.
# Returns:
#   The query as a character string without a trailing semicolon, so that it
#   can be embedded as a subquery.
# Errors:
#   Stops if `interval` is not a positive plain number or `ticker` is missing.
#   Both values are pasted into the SQL text, so they are checked / escaped
#   here instead of producing a malformed statement at the database.
get_outerquery <- function(interval,ticker){
    interval_chr <- trimws(as.character(interval))
    interval_ok <- length(interval_chr) > 0 &&
        all(grepl("^[0-9]+(\\.[0-9]+)?$", interval_chr)) &&
        all(as.numeric(interval_chr) > 0)
    if (!interval_ok) {
        stop("`interval` must be a positive number of minutes, got: ",
             paste(interval, collapse = ", "), call. = FALSE)
    }
    if (length(ticker) == 0 || anyNA(ticker)) {
        stop("`ticker` must be a non-missing symbol.", call. = FALSE)
    }
    # Double embedded single quotes so the ticker stays one SQL string literal.
    ticker_sql <- gsub("'", "''", as.character(ticker), fixed = TRUE)

    # Tag every trade with the start of its bucket:
    # floor(epoch / bucket_seconds) * bucket_seconds, back to a timestamp.
    innerquery <- paste0("SELECT TO_TIMESTAMP(
    FLOOR(
        EXTRACT(epoch FROM timestamp) / 
            EXTRACT(epoch FROM INTERVAL '",interval," min')
        ) * EXTRACT(epoch FROM INTERVAL '",interval," min')
    ) as time_interval,
    * 
        FROM iex.trade_reports 
    WHERE symbol = '",ticker_sql,"'
    ORDER BY timestamp")

    # Number trades per bucket from newest to oldest.
    mezzaninequery <- paste0("SELECT ",
                             "row_number() OVER (PARTITION BY time_interval ORDER BY timestamp DESC) as rownumber, ",
                             "* ",
                             "FROM ",
                             "(",innerquery,") as iq")
    # rownumber = 1 is the last trade of each bucket.
    outerquery <- paste0("SELECT * ",
                         "FROM ",
                         "(",mezzaninequery,") as mq ",
                         "WHERE rownumber=1")
    outerquery
}

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## 5. MERGING ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
### i. INNER JOIN
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Last price per 5-minute bucket for each of the two symbols.
oq.AAPL <- get_outerquery(interval="5",ticker="AAPL")
oq.MSFT <- get_outerquery(interval="5",ticker="MSFT")


# Inner join: only buckets in which both symbols traded are returned.
# (This section is the INNER JOIN exercise; the LEFT JOIN follows in ii.)
join_statement <- paste0("SELECT a.time_interval as ati,
                                   b.time_interval as bti,
                                   a.symbol as symbol_a,
                                   b.symbol as symbol_b,
                                   a.price as price_a,
                                   b.price as price_b 
                            FROM ",
                           "(",oq.AAPL,") as a ",
                           " INNER JOIN ",
                           "(",oq.MSFT,") as b ",
                           "ON a.time_interval = b.time_interval;")

# test join statement (no limit); dbGetQuery() fetches all rows and clears
# the result set.
dbGetQuery(connect,join_statement)


### ii. LEFT JOIN
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
interval <- "5"

# Get the minimum and maximum time_interval
minmax_time_str <- paste0("SELECT min(time_interval),max(time_interval) from (",oq.AAPL,") as a;")
minmax_time <- dbGetQuery(connect,minmax_time_str)

# Render both bounds as UTC wall-clock strings with an explicit format, taken
# from the column vectors rather than from one-column data frames.
series_start <- format(minmax_time[[1]][1], format="%Y-%m-%d %H:%M:%S", tz="UTC")
series_end <- format(minmax_time[[2]][1], format="%Y-%m-%d %H:%M:%S", tz="UTC")

# Both bounds are cast the same way: TIMESTAMP ... AT TIME ZONE 'UTC' reads the
# string as UTC regardless of the session time zone. Casting the end bound to
# TIMESTAMPTZ instead would parse it in the session time zone and shift it
# whenever that zone is not UTC.
timeseriesquery <- paste0("SELECT generate_series('",series_start,"'::TIMESTAMP AT TIME ZONE 'UTC','",
                     series_end,"'::TIMESTAMP AT TIME ZONE 'UTC','",interval,"m') as time_interval")
dbGetQuery(connect,timeseriesquery)


# Left join onto the complete grid: buckets without a trade keep a row with
# NULL symbol and price.
left_join_statement <- paste0("SELECT a.time_interval,
                                   b.symbol,
                                   b.price 
                               FROM ",
                                "(",timeseriesquery,") as a ",
                               " LEFT JOIN ",
                               "(",oq.MSFT,") as b ",
                               "ON a.time_interval = b.time_interval;")

dbGetQuery(connect,left_join_statement)


# Build the SQL text for a gap-free grid of `interval`-minute buckets with the
# last price of `ticker` in each bucket (NULL where no trade happened).
#
# Arguments:
#   interval  bucket width in minutes, passed on to get_outerquery().
#   ticker    symbol, passed on to get_outerquery().
# Returns:
#   The query as a character string without a trailing semicolon.
# Side effects:
#   Runs one query on the global connection `connect` to find the first and
#   last bucket of the ticker.
# Errors:
#   Stops if the ticker has no trades, because no grid can be generated.
get_Xmin_prices <- function(interval,ticker){
    oq <- get_outerquery(interval=interval,ticker=ticker)
    # Get the minimum and maximum time_interval.
    # dbGetQuery() fetches the row and clears the result set.
    minmax_time_str <- paste0("SELECT min(time_interval),max(time_interval) from (",oq,") as a;")
    minmax_time <- dbGetQuery(connect,minmax_time_str)

    if (nrow(minmax_time) == 0 ||
        is.na(minmax_time[[1]][1]) || is.na(minmax_time[[2]][1])) {
        stop("No trades found for ticker: ", paste(ticker, collapse = ", "),
             call. = FALSE)
    }

    # UTC wall-clock strings with an explicit format.
    series_start <- format(minmax_time[[1]][1], format="%Y-%m-%d %H:%M:%S", tz="UTC")
    series_end <- format(minmax_time[[2]][1], format="%Y-%m-%d %H:%M:%S", tz="UTC")

    # Both bounds use TIMESTAMP ... AT TIME ZONE 'UTC' so that they are read as
    # UTC independently of the session time zone (a TIMESTAMPTZ cast on the end
    # bound would be parsed in the session time zone first).
    timeseriesquery <- paste0("SELECT generate_series('",series_start,"'::TIMESTAMP AT TIME ZONE 'UTC','",
                              series_end,"'::TIMESTAMP AT TIME ZONE 'UTC','",interval,"m') as time_interval")


    left_join_statement <- paste0("SELECT a.time_interval,
                                   b.symbol,
                                   b.price 
                               FROM ",
                                  "(",timeseriesquery,") as a ",
                                  " LEFT JOIN ",
                                  "(",oq,") as b ",
                                  "ON a.time_interval = b.time_interval")
    left_join_statement
}


# Try the builder on 1-minute buckets of GME. dbGetQuery() sends the
# statement, fetches all rows and clears the result set.
test1 <- get_Xmin_prices(interval="1",ticker ="GME")

dbGetQuery(connect,test1)

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## 6. WRITE INFORMATION FORWARD ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

query0 <- get_Xmin_prices(interval="1",ticker ="GME")

# Running count of non-NULL prices: it only increases on rows that have a
# price, so a priced row and the NULL rows after it share one count value.
query1 <-  paste0("SELECT count(price) OVER (PARTITION BY 1 ORDER BY time_interval) AS count_prices, *
                    FROM (",query0," ) as q0")

# Within each count group the first row (by time) is the priced one; its price
# is copied to the whole group as price_filled.
query2 <- paste0("SELECT count_prices,time_interval,symbol,price, ",
       "first_value(price) OVER part_window AS price_filled ",
       "FROM (",
       query1,
       ") as foo WINDOW part_window AS (PARTITION BY count_prices ORDER BY time_interval)")
dbGetQuery(connect,query2)

# Build the SQL text for the bucketed prices of `ticker` with empty buckets
# filled from the most recent earlier bucket that has a price.
#
# Arguments:
#   interval  bucket width in minutes, passed on to get_Xmin_prices().
#   ticker    symbol, passed on to get_Xmin_prices().
# Returns:
#   The query as a character string (no trailing semicolon).
get_Xmin_prices_no_gaps <- function(interval,ticker){
    gappy_sql <- get_Xmin_prices(ticker = ticker, interval = interval)

    # Running count of non-NULL prices; rows after a priced row share its count.
    count_head <- paste0(
        "SELECT count(price) OVER (PARTITION BY 1 ORDER BY time_interval) AS count_prices, *\n",
        strrep(" ", 20),
        "FROM ("
    )
    counted_sql <- paste0(count_head, gappy_sql, " ) as GME")

    # Copy symbol and price of the first row of each count group to the group.
    fill_head <- paste0(
        "SELECT count_prices,time_interval,",
        "first_value(symbol) OVER part_window AS symbol, ",
        "first_value(price) OVER part_window AS price ",
        "FROM ("
    )
    fill_tail <- ") as foo WINDOW part_window AS (PARTITION BY count_prices ORDER BY time_interval)"

    paste0(fill_head, counted_sql, fill_tail)
}


# Try the gap-free builder. dbGetQuery() sends the statement, fetches all rows
# and clears the result set.
query <- get_Xmin_prices_no_gaps(interval="1",ticker ="GME")
dbGetQuery(connect,query)


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## 7. CALCULATE LOGARITHMIC FIRST DIFFERENCES (Log-returns) ----
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

clean_query <- get_Xmin_prices_no_gaps(interval="1",ticker ="GME")

# Log-returns use the natural logarithm. In PostgreSQL that is ln(); log() is
# the base-10 logarithm and would scale every return by 1 / ln(10).
lagged <- paste0("SELECT *, ln(price) - lag(ln(price),1) OVER (ORDER BY time_interval) as log_return FROM ",
       "(",clean_query,") as cq;")

dbGetQuery(connect,lagged)


# Build the SQL text for the log-returns of `ticker` on a gap-free grid of
# `interval`-minute buckets.
#
# Arguments:
#   interval  bucket width in minutes, passed on to get_Xmin_prices_no_gaps().
#   ticker    symbol, passed on to get_Xmin_prices_no_gaps().
# Returns:
#   The query as a character string selecting time_interval, symbol and
#   log_return. Like the other builders it has no trailing semicolon, so it can
#   be executed directly or embedded as a subquery.
get_first_differences <- function(interval,ticker){
    
    clean_query <- get_Xmin_prices_no_gaps(interval=interval,ticker =ticker)
    
    # ln() is the natural logarithm in PostgreSQL; log() would be base 10 and
    # scale every return by 1 / ln(10).
    lagged <- paste0("SELECT time_interval,symbol, ln(price) - lag(ln(price),1) OVER (ORDER BY time_interval) as log_return FROM ",
                     "(",clean_query,") as cq")
    lagged
}

# Try the log-return builder on 10-minute buckets of TSLA. dbGetQuery() sends
# the statement, fetches all rows and clears the result set.
test_lagged <- get_first_differences(interval="10",ticker ="TSLA")
dbGetQuery(connect,test_lagged)

# End of script: release the database connection opened at the top.
invisible(dbDisconnect(connect))