Commit 74fedb11 authored by Markus Mößler's avatar Markus Mößler
Browse files

updted solution material

parent 0339ef92
Loading
Loading
Loading
Loading

.gitignore

0 → 100644
+4 −0
Original line number Diff line number Diff line
.Rproj.user
.Rhistory
.RData
.Ruserdata
+83 −0
Original line number Diff line number Diff line
# ==============================================================================
# 1. PREAMBLE ----
# ==============================================================================

## a. Clean workspace and set global options ----
# Empty the global environment so objects from earlier sessions cannot leak in.
rm(list = ls())
options(
  stringsAsFactors = FALSE,
  digits.secs = 6 # allow up to six fractional-second digits when printing times
)

## b. Functions and other useful definitions ----
# Two spellings of a "not in" operator; each is the negation of `%in%`.
`%!in%` <- Negate(`%in%`)
`%ni%` <- Negate(`%in%`)

library("RPostgreSQL")
library("data.table")
library("sqldf")


# --- Database connection settings --------------------------------------------
# The active values are the admin login; the student login used elsewhere is
# user "student" with password "aidaho".
dsn_database <- "aidaho"    # name of the database
dsn_hostname <- "code-db-1" # database host ("127.0.0.1" would be localhost)
dsn_port     <- "5432"      # port number (alternative: "8001")
dsn_uid      <- "admin"     # admin user
dsn_pwd      <- "admin-pw"  # admin password

# --- Paths -------------------------------------------------------------------
# Input files are read from the DATA/ folder below the working directory.
path     <- getwd()
datapath <- paste0(path, "/DATA/")

# Open the PostgreSQL connection used by the rest of the script.
# On failure the driver's own message is reported and the script stops here:
# every later step needs `connect`, so carrying on would only fail later with
# a less helpful "object 'connect' not found" error.
tryCatch(
  {
    drv <- dbDriver("PostgreSQL")
    print("Connecting to Database…")
    connect <- dbConnect(
      drv,
      dbname = dsn_database,
      host = dsn_hostname,
      port = dsn_port,
      user = dsn_uid,
      password = dsn_pwd
    )
    print("Database Connected!")
  },
  error = function(cond) {
    print("Unable to connect to Database.")
    stop("Database connection failed: ", conditionMessage(cond), call. = FALSE)
  }
)

# Smoke test: ask the server for its version string.
res <- dbSendQuery(connect, "SELECT version();")
dbFetch(res, n = -1)


# Read the IEX DEEP trade-report extract for 2022-01-24.
trade_report <- read.csv(
  paste0(datapath, "data_feed_20220124_20220124_IEXTP1_DEEP1.0_trade_report.csv")
)
setDT(trade_report)
# Drop the final row (presumably an incomplete trailing record - TODO confirm).
trade_report <- head(trade_report, n = -1L)

# (Re)create iex.trade_reports from the data, with explicit column types.
dbWriteTable(
  connect,
  c("iex", "trade_reports"),
  trade_report,
  overwrite = TRUE,
  row.names = FALSE,
  field.types = c(
    ordinal = "integer",
    timestamp = "timestamptz",
    flags = "integer",
    symbol = "text",
    size = "integer",
    price = "double precision",
    trade_id = "double precision"
  )
)

# Add the composite primary key (ordinal, timestamp, symbol).
# ALTER TABLE is a statement, not a query: it produces no result set to fetch,
# so it is run with dbExecute(), which also releases the result handle.
DBI::dbExecute(
  connect,
  "ALTER TABLE iex.trade_reports ADD CONSTRAINT primary_key PRIMARY KEY (ordinal,timestamp,symbol);"
)



# Preview the uploaded table: show the result-set class, then the first ten rows.
res <- DBI::dbSendQuery(
  connect,
  "SELECT * FROM iex.trade_reports LIMIT 10;"
)
class(res)

data <- DBI::dbFetch(res, n = -1)
data
+0 −499
Original line number Diff line number Diff line

# Solution SQL Exercises

## 0. Prepare script

```{r}
# Start from an empty workspace and load the PostgreSQL driver package.
rm(list = ls())
options(stringsAsFactors = FALSE)
library(RPostgreSQL)
```

## 1. Connect to database

### Define access credentials

```{r}
# Access credentials for the course database (read-only student login).
dsn_database <- "aidaho"        # name of the database
dsn_hostname <- "193.196.53.49" # database server (localhost would be 127.0.0.1)
dsn_port     <- "8001"          # port number
dsn_uid      <- "student"       # user name
dsn_pwd      <- "aidaho"        # password
```

### Establish connection

```{r}
# Open the PostgreSQL connection used by all following chunks.
# On failure the driver's own message is reported and execution stops here:
# every later chunk needs `connect`, so carrying on would only fail later with
# a less helpful "object 'connect' not found" error.
tryCatch(
  {
    drv <- dbDriver("PostgreSQL")
    print("Connecting to Database…")
    connect <- dbConnect(
      drv,
      dbname = dsn_database,
      host = dsn_hostname,
      port = dsn_port,
      user = dsn_uid,
      password = dsn_pwd
    )
    print("Database Connected!")
  },
  error = function(cond) {
    print("Unable to connect to Database.")
    stop("Database connection failed: ", conditionMessage(cond), call. = FALSE)
  }
)
```

### Check connection

```{r}
# Smoke test: ask the server for its version string.
res <- dbSendQuery(connect, "SELECT version();")
dbFetch(res, n = -1)
```

## 2. Get an overview of the database

### Queries

```{r}
# First ten rows of the trade-report table.
res <- dbSendQuery(
  connect,
  " SELECT * FROM iex.trade_reports LIMIT 10;"
)
dbFetch(res, n = -1)
```

```{r}
# Column names and data types of everything in the `iex` schema.
res <- dbSendQuery(
  connect,
  "SELECT column_name, data_type 
    FROM information_schema.columns 
    WHERE table_schema = 'iex';"
)
dbFetch(res, n = -1)
```

### Question 1

What do the above queries return?

* The first query returns the 10 first observations from the table `iex.trade_reports`.
* The second query returns the name and data type of every column in the `iex` schema.

### Question 2

What other tables does the `information_schema` contain?

```{r}
# List the tables and views that make up the information_schema itself.
# (The earlier query listed the columns of the `iex` schema, which does not
# answer which other tables information_schema contains.)
res <- dbSendQuery(
  connect,
  "SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'information_schema'
    ORDER BY table_name;"
)
dbFetch(res, n = -1)
```

### Question 3

What information do the columns of `iex.trade_reports` contain?

```{r}
# Column names and data types of iex.trade_reports only.
res <- dbSendQuery(
  connect,
  "SELECT column_name, data_type 
    FROM information_schema.columns 
   WHERE table_schema = 'iex' AND table_name = 'trade_reports';"
)
dbFetch(res, n = -1)
```

* `ordinal`: Ordinal number that IDs the timestamp
* `timestamp`: The timestamp of the trade up to 6 digit precision
* `flags`: The trade flag as used by the IEX
* `symbol`: The stock ticker
* `size`: The size of the transaction (how many shares have been transacted)
* `price`: The price of the trade
* `trade_id`: ID number that identifies the transaction

### Question 4

Does a primary key exist in the table?

```{r}
# Nullability of each column of iex.trade_reports.
res <- dbSendQuery(
  connect,
  "SELECT column_name, is_nullable  
    FROM information_schema.columns 
   WHERE table_schema = 'iex' AND table_name = 'trade_reports';"
)
dbFetch(res, n = -1)
```

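* Yes. In this table the columns with `is_nullable = NO` in `information_schema.columns` (`ordinal`, `timestamp` and `symbol`) form the primary key. This is only an indication: primary-key columns are always `NOT NULL`, but a `NOT NULL` column does not have to belong to a primary key.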

## 3. Short Queries

### Query 1

How many distinct symbols does the table contain?

```{r}
# Number of distinct symbols in the table.
res <- dbSendQuery(
  connect,
  "SELECT COUNT(DISTINCT symbol) from iex.trade_reports;"
)
dbFetch(res, n = -1)
```

### Query 2

How many different financial instruments (symbols) have been traded between `2022-01-24 10:00:00-05` and `2022-01-24 11:00:00-05`?

```{r}
# Number of distinct symbols traded between 10:00 and 11:00 (UTC-5) on
# 2022-01-24; BETWEEN includes both bounds.
# The stray leading blank inside the upper-bound literal has been removed.
res <- dbSendQuery(
  connect,
  "SELECT COUNT(DISTINCT symbol)
    FROM iex.trade_reports
    WHERE timestamp BETWEEN '2022-01-24 10:00:00-05' AND '2022-01-24 11:00:00-05';"
)
dbFetch(res, n = -1)
```

### Query 3

How many trades of `AAPL` have taken place within the trading hours 10h00 and 11h00?

```{r}
# Number of distinct AAPL trade ids between 10:00 and 11:00 (UTC-5).
res <- dbSendQuery(
  connect,
  "SELECT COUNT(DISTINCT TRADE_ID) 
                            FROM iex.trade_reports 
                            WHERE timestamp BETWEEN '2022-01-24 10:00:00 -5:00:00' AND '2022-01-24 11:00:00 -5:00:00'
                            AND symbol = 'AAPL'"
)
dbFetch(res, n = -1)
```

### Query 4

Calculate the average price for each symbol in the sample?

```{r}
# Average price per symbol; only the first ten result rows are displayed.
res <- dbSendQuery(
  connect,
  "SELECT symbol,AVG(price) 
                            FROM iex.trade_reports 
                            GROUP BY symbol;"
)
head(dbFetch(res, n = -1), 10)
```

### Query 5

Which symbol has the highest average price?

```{r}
# Symbol with the highest average price: sort the per-symbol averages in
# descending order and keep the first row.
# The statement now ends with a single semicolon (it was doubled before).
res <- dbSendQuery(
  connect,
  "SELECT symbol, AVG(price)
    FROM iex.trade_reports
    GROUP BY symbol
    ORDER BY AVG(price) DESC LIMIT 1;"
)
dbFetch(res, n = -1)
```

### Query 6

How many symbols have an average price above 1000 USD?

```{r}
# Symbols whose average price exceeds 1000; the number of returned rows is the
# requested count.
res <- dbSendQuery(
  connect,
  "SELECT symbol, AVG(price) 
                            FROM iex.trade_reports 
                            GROUP BY symbol 
                            HAVING AVG(price) > 1000;"
)
dbFetch(res, n = -1)
```

## 4. Last price within 5-minute intervals

### Step 1) Construct inner query

```{r}
# Symbol and bucket length (in minutes) used for the step-by-step example.
ticker <- "MSFT"
interval <- "5"

# Inner query: tag every trade of `ticker` with the start of its
# `interval`-minute bucket by flooring the epoch seconds to a multiple of the
# bucket length.
innerquery <- paste0(
  "SELECT TO_TIMESTAMP(
    FLOOR(
        EXTRACT(epoch FROM timestamp) / 
            EXTRACT(epoch FROM INTERVAL '", interval, " min')
        ) * EXTRACT(epoch FROM INTERVAL '", interval, " min')
    ) as time_interval,
    * 
        FROM iex.trade_reports 
    WHERE symbol = '", ticker, "'
    ORDER BY timestamp"
)

# Try the inner query on its first ten rows.
res <- dbSendQuery(connect, paste0(innerquery, " LIMIT 10"))
dbFetch(res, n = -1)
```

### Step 2) Construct mezzanine query

```{r}
# Mezzanine query: number the trades inside each bucket from newest to oldest,
# so that rownumber 1 marks the last trade of the bucket.
mezzaninequery <- paste0(
  "SELECT row_number() OVER (PARTITION BY time_interval ORDER BY timestamp DESC) as rownumber, * FROM (",
  innerquery,
  ") as iq"
)

# Try the mezzanine query on its first ten rows.
res <- dbSendQuery(connect, paste0(mezzaninequery, " LIMIT 10"))
dbFetch(res, n = -1)
```

### Step 3) Construct outer query

```{r}
# Outer query: keep only the last trade of every bucket, in time order.
outerquery <- paste0(
  "SELECT * FROM (",
  mezzaninequery,
  ") as mq WHERE rownumber=1 ORDER BY time_interval"
)

# Run the complete outer query (no row limit).
res <- dbSendQuery(connect, outerquery)
dbFetch(res, n = -1)
```

### Step 4) Construct a function for abstraction

```{r}
# Build the SQL text that selects, for one ticker, the last trade inside each
# `interval`-minute bucket of iex.trade_reports.
#
# interval: bucket length in minutes; a single positive number, given either
#           as a number or as a string such as "5".
# ticker:   a single stock symbol, e.g. "AAPL".
#
# Returns one character string without a trailing semicolon, so it can be sent
# as is or embedded as a subquery. Its columns are rownumber, time_interval and
# all columns of iex.trade_reports. The result is not ordered.
# Stops with an error when an argument is not a single usable value.
get_outerquery <- function(interval, ticker) {
  # Both arguments are pasted into the SQL text, so reject anything that is
  # not a single positive number / a single non-missing symbol.
  minutes <- suppressWarnings(as.numeric(interval))
  if (length(interval) != 1L || is.na(minutes) || minutes <= 0) {
    stop("`interval` must be a single positive number of minutes",
         call. = FALSE)
  }
  if (length(ticker) != 1L || is.na(ticker)) {
    stop("`ticker` must be a single symbol", call. = FALSE)
  }
  # Double embedded single quotes so the ticker stays inside its SQL literal.
  ticker_sql <- gsub("'", "''", as.character(ticker), fixed = TRUE)

  # Inner query: tag each trade with the start of its bucket by flooring the
  # epoch seconds to a multiple of the bucket length.
  innerquery <- paste0("SELECT TO_TIMESTAMP(
    FLOOR(
        EXTRACT(epoch FROM timestamp) / 
            EXTRACT(epoch FROM INTERVAL '", interval, " min')
        ) * EXTRACT(epoch FROM INTERVAL '", interval, " min')
    ) as time_interval,
    * 
        FROM iex.trade_reports 
    WHERE symbol = '", ticker_sql, "'
    ORDER BY timestamp")

  # Mezzanine query: number trades inside each bucket from newest to oldest.
  mezzaninequery <- paste0(
    "SELECT ",
    "row_number() OVER (PARTITION BY time_interval ORDER BY timestamp DESC) as rownumber, ",
    "* ",
    "FROM ",
    "(", innerquery, ") as iq"
  )

  # Outer query: rownumber 1 is the last trade of each bucket.
  paste0(
    "SELECT * ",
    "FROM ",
    "(", mezzaninequery, ") as mq ",
    "WHERE rownumber=1"
  )
}
```

### Step 5) Test function and send query

```{r}
# test outerquery (no limit)
outerquery <- get_outerquery(interval="5",ticker="AAPL")
res <- dbSendQuery(connect,outerquery)
dbFetch(res, n = -1)
```

## 5. Merging

### 5.i Inner join

#### Step 1) Construct outer query for `AAPL` and `MSFT`.

```{r}
# 5-minute last-price queries for the two symbols that will be joined.
oq.AAPL <- get_outerquery(interval = "5", ticker = "AAPL")
oq.MSFT <- get_outerquery(interval = "5", ticker = "MSFT")
```

#### Step 2) Construct join statement.

```{r}
# Inner join of the AAPL and MSFT 5-minute series on the bucket start time:
# only buckets in which both symbols traded are returned.
# (This section is the inner-join exercise; the statement previously used
# LEFT JOIN, which belongs to the next section.)
join_statement <- paste0("SELECT a.time_interval as ati,
                                   b.time_interval as bti,
                                   a.symbol as symbol_a,
                                   b.symbol as symbol_b,
                                   a.price as price_a,
                                   b.price as price_b 
                            FROM ",
                           "(", oq.AAPL, ") as a ",
                           " INNER JOIN ",
                           "(", oq.MSFT, ") as b ",
                           "ON a.time_interval = b.time_interval;")
```

#### Step 3) Send query

```{r}
# Run the join statement and fetch every row.
res <- dbSendQuery(connect, join_statement)
dbFetch(res, n = -1)
```

### 5.ii Left join

#### Step 1) Construct `minmax_time`

```{r}
# Bucket length in minutes.
interval <- "5"

# First and last bucket start time of the AAPL series.
minmax_time_str <- paste0(
  "SELECT min(time_interval),max(time_interval) from (",
  oq.AAPL,
  ") as a;"
)
res <- dbSendQuery(connect, minmax_time_str)
minmax_time <- dbFetch(res, n = -1)
```

...

#### Step 2) Construct `timeseriesquery`

```{r}
# Regular grid of `interval`-minute timestamps from the first to the last
# observed bucket.
# Both bounds are written as UTC wall-clock strings and cast with
# ::TIMESTAMP ... AT TIME ZONE 'UTC', which marks them as UTC instants whatever
# the session time zone is. The upper bound previously used ::TIMESTAMPTZ,
# which reads the string in the session time zone and so only matched the
# lower bound when the session ran in UTC.
timeseriesquery <- paste0(
  "SELECT generate_series('", format(minmax_time[[1]], tz = "UTC"),
  "'::TIMESTAMP AT TIME ZONE 'UTC','",
  format(minmax_time[[2]], tz = "UTC"),
  "'::TIMESTAMP AT TIME ZONE 'UTC','", interval, "m') as time_interval"
)
res <- dbSendQuery(connect, timeseriesquery)
dbFetch(res, n = -1)
```

...

#### Step 3) Construct `left_join_statement` statement.

Construct a query for a left join between the `timeseriesquery` and the outer query for AAPL containing the time interval, symbol and price.

```{r}
# Left join of the regular time grid with the AAPL series: every grid point is
# kept, and symbol/price are NULL where AAPL did not trade in that bucket.
# The grid bounds (minmax_time) come from the AAPL series, so the right-hand
# side must be the AAPL query as well (it was the MSFT query before).
left_join_statement <- paste0("SELECT a.time_interval,
                                   b.symbol,
                                   b.price 
                               FROM ",
                                "(", timeseriesquery, ") as a ",
                               " LEFT JOIN ",
                               "(", oq.AAPL, ") as b ",
                               "ON a.time_interval = b.time_interval;")

res <- dbSendQuery(connect, left_join_statement)
dbFetch(res, n = -1)
```

...

#### Step 4) Construct a function for abstraction

Construct an *R* function that determines the `minmax_time` variable based on the inputted symbol and interval length and returns the string for the left join query.

```{r}
# Build the SQL text for a gap-preserving price series of one ticker: a regular
# grid of `interval`-minute timestamps left-joined with the last trade of each
# bucket. Buckets without a trade have NULL symbol and price.
#
# interval: bucket length in minutes (number or string such as "5").
# ticker:   a single stock symbol.
# con:      database connection used to look up the first and last bucket;
#           defaults to the global `connect`.
#
# Returns one character string without a trailing semicolon, so it can be
# embedded as a subquery. Stops with an error when the ticker has no trades.
get_Xmin_prices <- function(interval, ticker, con = connect) {
  oq <- get_outerquery(interval = interval, ticker = ticker)

  # First and last bucket start time of the series. dbGetQuery() sends the
  # query, fetches all rows and releases the result handle in one call.
  minmax_time <- dbGetQuery(
    con,
    paste0("SELECT min(time_interval),max(time_interval) from (", oq, ") as a;")
  )
  # min()/max() over zero rows are NULL; without bounds no grid can be built.
  if (is.na(minmax_time[[1]][1]) || is.na(minmax_time[[2]][1])) {
    stop("no trades found for ticker '", ticker, "'", call. = FALSE)
  }

  # Both bounds are UTC wall-clock strings cast with ::TIMESTAMP ... AT TIME
  # ZONE 'UTC', so they denote UTC instants whatever the session time zone is.
  # (The upper bound previously used ::TIMESTAMPTZ, which reads the string in
  # the session time zone.)
  timeseriesquery <- paste0(
    "SELECT generate_series('", format(minmax_time[[1]], tz = "UTC"),
    "'::TIMESTAMP AT TIME ZONE 'UTC','",
    format(minmax_time[[2]], tz = "UTC"),
    "'::TIMESTAMP AT TIME ZONE 'UTC','", interval, "m') as time_interval"
  )

  paste0("SELECT a.time_interval,
                                   b.symbol,
                                   b.price 
                               FROM ",
         "(", timeseriesquery, ") as a ",
         " LEFT JOIN ",
         "(", oq, ") as b ",
         "ON a.time_interval = b.time_interval")
}
```

#### Step 5) Test function and send query

Call the function for the symbol GME and an interval length of 1 minute. Submit the resulting query. What do you observe?

```{r}
# test function and send query
test1 <- get_Xmin_prices(interval="1",ticker ="GME")
res <- dbSendQuery(connect,test1)
dbFetch(res, n = -1)
```

## 6. Fill Gaps

### Step 1) Construct first query

```{r}
# 1-minute GME series, still containing gaps (NULL prices).
query1 <- get_Xmin_prices(interval = "1", ticker = "GME")
```

### Step 2) Construct the second query

```{r}
# Add a running count of the prices seen so far, in time order.
query2 <- paste0(
  "SELECT count(price) OVER (PARTITION BY 1 ORDER BY time_interval) AS count_prices, *
                    FROM (",
  query1,
  " ) as q1"
)
```

### Step 3) Construct the final query

```{r}
# Final query: within each count_prices group, take the first price in time
# order and expose it as price_filled.
res_query <- paste0(
  "SELECT count_prices,time_interval,symbol,price, first_value(price) OVER part_window AS price_filled FROM (",
  query2,
  ") as foo WINDOW part_window AS (PARTITION BY count_prices ORDER BY time_interval)"
)
```

### Step 4) Send query

```{r}
# Send the final query built in the previous step (res_query), not the
# intermediate query2, so that the price_filled column is part of the result.
res <- dbSendQuery(connect, res_query)
dbFetch(res, n = -1)
```

### Step 5) Construct a function for abstraction

Write a function that returns a modified version of the final query in which the symbol and the price are carried forward. Call the function `get_Xmin_prices_no_gaps`.

```{r}
# Build the SQL text for a gap-free price series of one ticker: the series from
# get_Xmin_prices() with symbol and price replaced by the first value of their
# count_prices group.
#
# interval: bucket length in minutes (number or string such as "5").
# ticker:   a single stock symbol.
#
# Returns one character string without a trailing semicolon; its columns are
# count_prices, time_interval, symbol and price.
get_Xmin_prices_no_gaps <- function(interval, ticker) {
  query1 <- get_Xmin_prices(interval = interval, ticker = ticker)

  # Running count of the prices seen so far, in time order. The subquery alias
  # is the neutral `q1`, since the function is not specific to one symbol.
  query2 <- paste0(
    "SELECT count(price) OVER (PARTITION BY 1 ORDER BY time_interval) AS count_prices, *
                    FROM (", query1, " ) as q1"
  )

  paste0(
    "SELECT count_prices,time_interval,",
    "first_value(symbol) OVER part_window AS symbol, ",
    "first_value(price) OVER part_window AS price ",
    "FROM (",
    query2,
    ") as foo WINDOW part_window AS (PARTITION BY count_prices ORDER BY time_interval)"
  )
}
```

### Step 6) Test function and send query

```{r}
# Test function and send query
query <- get_Xmin_prices_no_gaps(interval="1",ticker ="GME")
res <- dbSendQuery(connect,query)
dbFetch(res, n = -1)
```

## 7. Calculate logarithmic first differences (log-returns)

### Step 1) Construct query to query cleaned series

```{r}
# Gap-free 1-minute GME series used as input for the return calculation.
clean_query <- get_Xmin_prices_no_gaps(interval = "1", ticker = "GME")
```

### Step 2) Construct lagged query to query logarithmic first differences of the price series

```{r}
# Log-return: natural log of the price minus the natural log of the previous
# price in time order. PostgreSQL's log() is the base-10 logarithm, so ln() is
# used here.
lagged <- paste0(
  "SELECT *, ln(price) - lag(ln(price),1) OVER (ORDER BY time_interval) as log_return FROM ",
  "(", clean_query, ") as cq;"
)
```

### Step 3) Send the lagged query.

```{r}
# Run the lagged query and fetch every row.
res <- dbSendQuery(connect, lagged)
dbFetch(res, n = -1)
```

### Step 4) Construct a function for abstraction

Write a function that, given a ticker symbol and an interval length, returns the string for a query that calculates the log-returns.

```{r}
# Build the SQL text that computes log-returns of the gap-free price series of
# one ticker.
#
# interval: bucket length in minutes (number or string such as "5").
# ticker:   a single stock symbol.
#
# Returns one character string ending in a semicolon; its columns are
# time_interval, symbol and log_return. The first row has a NULL log_return
# because it has no predecessor.
get_first_differences <- function(interval, ticker) {
  clean_query <- get_Xmin_prices_no_gaps(interval = interval, ticker = ticker)

  # Natural log via ln(): PostgreSQL's log() is the base-10 logarithm.
  paste0(
    "SELECT time_interval,symbol, ln(price) - lag(ln(price),1) OVER (ORDER BY time_interval) as log_return FROM ",
    "(", clean_query, ") as cq;"
  )
}
```

### Step 5) Test function and send query

```{r}
# Test function and query
test_lagged <- get_first_differences(interval="10",ticker ="TSLA")
res <- dbSendQuery(connect,test_lagged)
dbFetch(res, n = -1)
```
+0 −2735

File deleted.

Preview size limit exceeded, changes collapsed.

Loading