Survival Analysis Exercises

In this document we will conduct survival analysis on a data set relating employee turnover to several variables that were measured at the time an employee was hired.

Exercise 1 - Load, explore, and structure the survival data

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(survival)
library(survminer)

## Loading required package: ggplot2

## Loading required package: ggpubr

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Load the survival analysis data file from the url provided
url <- "https://rstudio-conf-2022.github.io/people-analytics-rstats/data/survival_analysis_data.csv"
surv_data <- read.csv(url)

# view the first few rows in the data set to acquaint yourself with the data
head(surv_data)

##      EID REFERRAL NUMBER_JOBS PROG_EXER_RATE INTPER_RATE PRIOR_EXP HIRE_REC
## 1 453921       No           2           Pass           6       Yes        2
## 2 732470       No          3+           Pass          10        No        2
## 3 169619       No          3+       Concerns          10        No        1
## 4 953421       No         0-1       Concerns           6        No        2
## 5 363973       No           2           Pass          10       Yes        3
## 6 779914       No         0-1           Pass           7        No        2
##    HIRE_DATE TURNOVER_DATE TURNOVER
## 1 2017-02-01          <NA>       No
## 2 2016-02-01    2017-12-01      Yes
## 3 2016-02-01          <NA>       No
## 4 2017-02-01    2017-07-01      Yes
## 5 2016-02-01          <NA>       No
## 6 2017-01-01    2021-01-01      Yes

# convert columns ending in 'DATE' to date format
surv_data <- surv_data |> 
  dplyr::mutate(across(ends_with("DATE"), as.Date))

# Create a censor indicator variable that is 1 if the employee has left and 0 otherwise
surv_data <- 
  surv_data |> 
  dplyr::mutate(
    CENSOR = dplyr::case_when(
      TURNOVER == "Yes" ~ 1,
      TRUE ~ 0
    )
  )

# Replace missing TURNOVER_DATE values with "2022-07-01" -- think about what missing TURNOVER_DATES mean
surv_data <- 
  surv_data |>
  dplyr::mutate(
    TURNOVER_DATE = dplyr::case_when(
      is.na(TURNOVER_DATE) ~ as.Date("2022-07-01"),
      TRUE ~ TURNOVER_DATE
    )
  )

# Use the lubridate::interval() function to create the EVENT_TIME variable 
# Hint: Use  %/% months(1) to transform the time difference into months 
surv_data <- 
  surv_data |>
  dplyr::mutate(
    EVENT_TIME = lubridate::interval(HIRE_DATE, TURNOVER_DATE) %/% months(1)
  )

# Calculate some descriptive statistics for EVENT_TIME using the full sample, then group the data by CENSOR to see how the descriptives
# change. 
mean(surv_data$EVENT_TIME)

## [1] 43.20513

median(surv_data$EVENT_TIME)

## [1] 47

surv_data |>
  dplyr::group_by(
    CENSOR
  ) |>
  dplyr::summarise(
    MEAN_EVENT_TIME = mean(EVENT_TIME),
    MEDIAN_EVENT_TIME = median(EVENT_TIME),
    SD_EVENT_TIME = sd(EVENT_TIME)
  )

## # A tibble: 2 x 4
##   CENSOR MEAN_EVENT_TIME MEDIAN_EVENT_TIME SD_EVENT_TIME
##    <dbl>           <dbl>             <dbl>         <dbl>
## 1      0            69.9                66          6.54
## 2      1            28.0                27         17.6

Exercise 2 - Create a survival object and estimate survival functions using Kaplan Meier estimates

# Create a survival object using survival::Surv().
# Remember it requires two arguments EVENT_TIME and CENSOR
surv_object <- survival::Surv(
  event = surv_data$CENSOR,
  time = surv_data$EVENT_TIME
)

# Use survival::survfit() to estimate survival probabilities using the Kaplan Meier estimator for the entire sample
# Save the survfit output into an object called km_all

# How many months after hiring would you expect 50% of the sample to remain at the firm? (e.g. the median survival time)
# How many months after hiring would you expect 75% of the sample to remain at the firm?

km_all <- survival::survfit(surv_object ~ 1, data = surv_data)
summary(km_all)

## Call: survfit(formula = surv_object ~ 1, data = surv_data)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   8000     182    0.977 0.00167        0.974        0.981
##     2   7818     181    0.955 0.00233        0.950        0.959
##     3   7637     187    0.931 0.00283        0.926        0.937
##     4   7450     135    0.914 0.00313        0.908        0.921
##     5   7315     128    0.898 0.00338        0.892        0.905
##     6   7187     110    0.885 0.00357        0.878        0.892
##     7   7077      87    0.874 0.00371        0.867        0.881
##     8   6990      68    0.865 0.00382        0.858        0.873
##     9   6922      41    0.860 0.00388        0.853        0.868
##    10   6881      58    0.853 0.00396        0.845        0.861
##    11   6823      36    0.848 0.00401        0.841        0.856
##    12   6787      35    0.844 0.00406        0.836        0.852
##    13   6752      27    0.841 0.00409        0.833        0.849
##    14   6725      29    0.837 0.00413        0.829        0.845
##    15   6696      38    0.832 0.00418        0.824        0.840
##    16   6658      41    0.827 0.00423        0.819        0.835
##    17   6617      36    0.823 0.00427        0.814        0.831
##    18   6581      46    0.817 0.00432        0.808        0.825
##    19   6535      57    0.810 0.00439        0.801        0.818
##    20   6478      69    0.801 0.00446        0.792        0.810
##    21   6409      93    0.789 0.00456        0.781        0.798
##    22   6316     117    0.775 0.00467        0.766        0.784
##    23   6199     134    0.758 0.00479        0.749        0.768
##    24   6065     124    0.743 0.00489        0.733        0.752
##    25   5941     163    0.722 0.00501        0.713        0.732
##    26   5778     218    0.695 0.00515        0.685        0.705
##    27   5560     197    0.670 0.00526        0.660        0.681
##    28   5363     199    0.645 0.00535        0.635        0.656
##    29   5164     178    0.623 0.00542        0.613        0.634
##    30   4986     160    0.603 0.00547        0.593        0.614
##    31   4826     117    0.589 0.00550        0.578        0.600
##    32   4709      94    0.577 0.00552        0.566        0.588
##    33   4615      76    0.567 0.00554        0.557        0.578
##    34   4539      51    0.561 0.00555        0.550        0.572
##    35   4488      32    0.557 0.00555        0.546        0.568
##    36   4456      15    0.555 0.00556        0.544        0.566
##    37   4441      12    0.554 0.00556        0.543        0.565
##    38   4429      16    0.552 0.00556        0.541        0.563
##    39   4413      15    0.550 0.00556        0.539        0.561
##    40   4398      26    0.547 0.00557        0.536        0.558
##    41   4372      47    0.541 0.00557        0.530        0.552
##    42   4325      55    0.534 0.00558        0.523        0.545
##    43   4270      59    0.526 0.00558        0.516        0.537
##    44   4211      59    0.519 0.00559        0.508        0.530
##    45   4152      85    0.508 0.00559        0.498        0.519
##    46   4067      64    0.500 0.00559        0.490        0.511
##    47   4003      83    0.490 0.00559        0.479        0.501
##    48   3920     100    0.478 0.00558        0.467        0.489
##    49   3820     122    0.462 0.00557        0.451        0.473
##    50   3698      97    0.450 0.00556        0.439        0.461
##    51   3601      90    0.439 0.00555        0.428        0.450
##    52   3511      91    0.428 0.00553        0.417        0.438
##    53   3420      88    0.417 0.00551        0.406        0.427
##    54   3332      88    0.406 0.00549        0.395        0.416
##    55   3244      68    0.397 0.00547        0.386        0.408
##    56   3152      57    0.390 0.00545        0.379        0.401
##    57   3088      41    0.385 0.00544        0.374        0.395
##    58   3040      39    0.380 0.00543        0.369        0.390
##    59   2951      39    0.375 0.00542        0.364        0.385
##    60   2868      18    0.372 0.00541        0.362        0.383
##    61   2813      11    0.371 0.00541        0.360        0.382
##    62   2726       2    0.371 0.00541        0.360        0.381
##    63   2645       3    0.370 0.00540        0.360        0.381
##    64   2496       3    0.370 0.00540        0.359        0.380
##    65   2353       1    0.370 0.00540        0.359        0.380
##    67   1502       2    0.369 0.00541        0.359        0.380
##    68   1485       9    0.367 0.00543        0.356        0.378
##    69   1461       9    0.365 0.00545        0.354        0.375
##    70   1445      10    0.362 0.00547        0.352        0.373
##    71   1403      13    0.359 0.00549        0.348        0.370
##    72   1348       9    0.356 0.00551        0.346        0.367

# Look at the structure of km_all using str()
str(km_all)

## List of 16
##  $ n        : int 8000
##  $ time     : num [1:78] 1 2 3 4 5 6 7 8 9 10 ...
##  $ n.risk   : num [1:78] 8000 7818 7637 7450 7315 ...
##  $ n.event  : num [1:78] 182 181 187 135 128 110 87 68 41 58 ...
##  $ n.censor : num [1:78] 0 0 0 0 0 0 0 0 0 0 ...
##  $ surv     : num [1:78] 0.977 0.955 0.931 0.914 0.898 ...
##  $ std.err  : num [1:78] 0.00171 0.00244 0.00304 0.00342 0.00376 ...
##  $ cumhaz   : num [1:78] 0.0227 0.0459 0.0704 0.0885 0.106 ...
##  $ std.chaz : num [1:78] 0.00169 0.00241 0.003 0.00338 0.00372 ...
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:78] 0.974 0.95 0.926 0.908 0.892 ...
##  $ upper    : num [1:78] 0.981 0.959 0.937 0.921 0.905 ...
##  $ call     : language survfit(formula = surv_object ~ 1, data = surv_data)
##  - attr(*, "class")= chr "survfit"

# Using km_all, create a new data frame called km_calc which contains the following variables from km_all:
# time, n.risk, n.censor, n.event, and surv 

km_calc <- 
  tibble::tibble(
    time = km_all$time,
    n.risk = km_all$n.risk,
    n.censor = km_all$n.censor,
    n.event = km_all$n.event,
    survival = km_all$surv
  )

# Using km_calc, determine the following:
# How many turnovers occurred in the dataset?
sum(km_calc$n.event)

## [1] 5090

# How many censored observations are in the dataset?
sum(km_calc$n.censor)

## [1] 2910

# At what time point do the first censored observations appear? 
min(which(km_calc$n.censor != 0))

## [1] 55

# Subtract n.risk at time point 2 from n.risk at time point 1.
km_calc$n.risk[1] - km_calc$n.risk[2]

## [1] 182

# Why did n.risk decrease from time point 1 to time point 2?
# n.risk at time point 2 = n.risk at time point 1 - n.events at time point 1 - n.censor at time point 1

# Subtract n.risk at time point 56 from time point 55.
km_calc$n.risk[55] - km_calc$n.risk[56]

## [1] 92

# Why did n.risk decrease from time point 55 to time point 56?
# n.risk at time point 56 = n.risk at time point 55 - n.events at time point 55 - n.censor at time point 55

# The formula to calculate the survival probabilities using the KM estimator is: 
# survival(t - 1) * (1 - (n.event[t] / n.risk[t])) 
# That is, the survival probability at time t is equal to the survival probability at time t-1 multiplied by 1 - (number of events at 
# time t / number of individuals at risk at time t).

# Using the formula above, manual calculate the survival probabilities and save them as variable in km_calc as surv_calc. Take a moment to think about how censored observations are used in the calculation as well.
km_calc <- 
  km_calc |>
  dplyr::mutate(
    surv_calc_1 = (1 - n.event / n.risk),
    surv_calc = cumprod(surv_calc_1)
  )

Median: 47 months 75: 24 months

# Use survminer::ggsurvplot() to plot the overall survival function.
survminer::ggsurvplot(fit = km_all)

# Use survival::survfit() to estimate survival probabilities by the REFERRAL variable. 
# For the portion of the sample that was referred for the job (REFERRAL == "Yes"), how many months after hiring would you expect 50% of the sample to remain at the firm? What about for the portion of the sample that was not referred? 
# Why would the median survival time be missing? 

km_ref <- survival::survfit(surv_object ~ REFERRAL, data = surv_data)
summary(km_ref)

## Call: survfit(formula = surv_object ~ REFERRAL, data = surv_data)
## 
##                 REFERRAL=No 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   7230     178    0.975 0.00182        0.972        0.979
##     2   7052     171    0.952 0.00252        0.947        0.957
##     3   6881     180    0.927 0.00306        0.921        0.933
##     4   6701     131    0.909 0.00339        0.902        0.915
##     5   6570     122    0.892 0.00365        0.885        0.899
##     6   6448     105    0.877 0.00386        0.870        0.885
##     7   6343      86    0.865 0.00401        0.858        0.873
##     8   6257      65    0.856 0.00412        0.848        0.865
##     9   6192      40    0.851 0.00419        0.843        0.859
##    10   6152      55    0.843 0.00428        0.835        0.852
##    11   6097      32    0.839 0.00432        0.830        0.847
##    12   6065      32    0.834 0.00437        0.826        0.843
##    13   6033      27    0.831 0.00441        0.822        0.839
##    14   6006      28    0.827 0.00445        0.818        0.836
##    15   5978      34    0.822 0.00450        0.813        0.831
##    16   5944      39    0.817 0.00455        0.808        0.826
##    17   5905      35    0.812 0.00460        0.803        0.821
##    18   5870      42    0.806 0.00465        0.797        0.815
##    19   5828      55    0.798 0.00472        0.789        0.808
##    20   5773      67    0.789 0.00480        0.780        0.799
##    21   5706      87    0.777 0.00489        0.768        0.787
##    22   5619     110    0.762 0.00501        0.752        0.772
##    23   5509     129    0.744 0.00513        0.734        0.754
##    24   5380     116    0.728 0.00523        0.718        0.738
##    25   5264     157    0.706 0.00536        0.696        0.717
##    26   5107     210    0.677 0.00550        0.667        0.688
##    27   4897     191    0.651 0.00561        0.640        0.662
##    28   4706     192    0.624 0.00570        0.613        0.636
##    29   4514     170    0.601 0.00576        0.590        0.612
##    30   4344     148    0.580 0.00580        0.569        0.592
##    31   4196     111    0.565 0.00583        0.554        0.577
##    32   4085      86    0.553 0.00585        0.542        0.565
##    33   3999      71    0.543 0.00586        0.532        0.555
##    34   3928      49    0.537 0.00586        0.525        0.548
##    35   3879      30    0.532 0.00587        0.521        0.544
##    36   3849      14    0.530 0.00587        0.519        0.542
##    37   3835      12    0.529 0.00587        0.517        0.540
##    38   3823      15    0.527 0.00587        0.515        0.538
##    39   3808      14    0.525 0.00587        0.513        0.536
##    40   3794      24    0.521 0.00587        0.510        0.533
##    41   3770      45    0.515 0.00588        0.504        0.527
##    42   3725      48    0.509 0.00588        0.497        0.520
##    43   3677      55    0.501 0.00588        0.490        0.513
##    44   3622      54    0.493 0.00588        0.482        0.505
##    45   3568      83    0.482 0.00588        0.471        0.494
##    46   3485      59    0.474 0.00587        0.462        0.486
##    47   3426      77    0.463 0.00586        0.452        0.475
##    48   3349      93    0.450 0.00585        0.439        0.462
##    49   3256     115    0.434 0.00583        0.423        0.446
##    50   3141      89    0.422 0.00581        0.411        0.434
##    51   3052      86    0.410 0.00578        0.399        0.422
##    52   2966      84    0.399 0.00576        0.387        0.410
##    53   2882      81    0.387 0.00573        0.376        0.399
##    54   2801      86    0.376 0.00570        0.365        0.387
##    55   2715      59    0.367 0.00567        0.356        0.379
##    56   2638      53    0.360 0.00565        0.349        0.371
##    57   2579      39    0.355 0.00563        0.344        0.366
##    58   2534      35    0.350 0.00561        0.339        0.361
##    59   2451      36    0.345 0.00559        0.334        0.356
##    60   2376      16    0.342 0.00558        0.331        0.353
##    61   2329       9    0.341 0.00558        0.330        0.352
##    62   2256       2    0.341 0.00558        0.330        0.352
##    63   2183       3    0.340 0.00558        0.329        0.351
##    64   2058       3    0.340 0.00558        0.329        0.351
##    65   1942       1    0.339 0.00558        0.329        0.351
##    67   1227       2    0.339 0.00558        0.328        0.350
##    68   1212       9    0.336 0.00560        0.326        0.348
##    69   1189       8    0.334 0.00562        0.323        0.345
##    70   1177       9    0.332 0.00564        0.321        0.343
##    71   1143      12    0.328 0.00567        0.317        0.339
##    72   1100       9    0.325 0.00570        0.314        0.337
## 
##                 REFERRAL=Yes 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    770       4    0.995 0.00259        0.990        1.000
##     2    766      10    0.982 0.00481        0.972        0.991
##     3    756       7    0.973 0.00587        0.961        0.984
##     4    749       4    0.968 0.00639        0.955        0.980
##     5    745       6    0.960 0.00708        0.946        0.974
##     6    739       5    0.953 0.00761        0.938        0.968
##     7    734       1    0.952 0.00771        0.937        0.967
##     8    733       3    0.948 0.00800        0.933        0.964
##     9    730       1    0.947 0.00809        0.931        0.963
##    10    729       3    0.943 0.00836        0.927        0.959
##    11    726       4    0.938 0.00871        0.921        0.955
##    12    722       3    0.934 0.00896        0.916        0.951
##    14    719       1    0.932 0.00904        0.915        0.950
##    15    718       4    0.927 0.00936        0.909        0.946
##    16    714       2    0.925 0.00951        0.906        0.944
##    17    712       1    0.923 0.00959        0.905        0.942
##    18    711       4    0.918 0.00988        0.899        0.938
##    19    707       2    0.916 0.01002        0.896        0.935
##    20    705       2    0.913 0.01016        0.893        0.933
##    21    703       6    0.905 0.01056        0.885        0.926
##    22    697       7    0.896 0.01100        0.875        0.918
##    23    690       5    0.890 0.01129        0.868        0.912
##    24    685       8    0.879 0.01174        0.857        0.903
##    25    677       6    0.871 0.01206        0.848        0.895
##    26    671       8    0.861 0.01247        0.837        0.886
##    27    663       6    0.853 0.01275        0.829        0.879
##    28    657       7    0.844 0.01307        0.819        0.870
##    29    650       8    0.834 0.01342        0.808        0.860
##    30    642      12    0.818 0.01390        0.791        0.846
##    31    630       6    0.810 0.01413        0.783        0.839
##    32    624       8    0.800 0.01441        0.772        0.829
##    33    616       5    0.794 0.01459        0.765        0.823
##    34    611       2    0.791 0.01465        0.763        0.820
##    35    609       2    0.788 0.01472        0.760        0.818
##    36    607       1    0.787 0.01475        0.759        0.816
##    38    606       1    0.786 0.01479        0.757        0.815
##    39    605       1    0.784 0.01482        0.756        0.814
##    40    604       2    0.782 0.01488        0.753        0.812
##    41    602       2    0.779 0.01495        0.750        0.809
##    42    600       7    0.770 0.01516        0.741        0.800
##    43    593       4    0.765 0.01528        0.736        0.795
##    44    589       5    0.758 0.01543        0.729        0.789
##    45    584       2    0.756 0.01548        0.726        0.787
##    46    582       5    0.749 0.01562        0.719        0.781
##    47    577       6    0.742 0.01578        0.711        0.773
##    48    571       7    0.732 0.01595        0.702        0.764
##    49    564       7    0.723 0.01612        0.692        0.756
##    50    557       8    0.713 0.01630        0.682        0.746
##    51    549       4    0.708 0.01639        0.676        0.741
##    52    545       7    0.699 0.01653        0.667        0.732
##    53    538       7    0.690 0.01667        0.658        0.723
##    54    531       2    0.687 0.01671        0.655        0.721
##    55    529       9    0.675 0.01687        0.643        0.709
##    56    514       4    0.670 0.01695        0.638        0.704
##    57    509       2    0.667 0.01698        0.635        0.702
##    58    506       4    0.662 0.01705        0.630        0.696
##    59    500       3    0.658 0.01710        0.626        0.693
##    60    492       2    0.656 0.01714        0.623        0.690
##    61    484       2    0.653 0.01717        0.620        0.687
##    69    272       1    0.650 0.01728        0.617        0.685
##    70    268       1    0.648 0.01738        0.615        0.683
##    71    260       1    0.645 0.01749        0.612        0.681

Referral == “Yes” Median is undefined as more than 50% remain after end of study.

Referral == “No” Median: 44 months

# Use survminer::ggsurvplot() to plot the survival function by REFERRAL status. Are the two curves different? 
survminer::ggsurvplot(km_ref)

# Using the survfit function, estimate the survival probabilities for the interaction between REFERRAL and NUMBER_JOBS variables in 
# surv_data. Then plot the survival curve using ggsurvplot and provide an interpretation of the curves. Does their appear to be an
# interaction?
# 
# HINT: You don't need to create a new variable!  

km_ref_job <- survival::survfit(surv_object ~ REFERRAL + NUMBER_JOBS, data = surv_data)
survminer::ggsurvplot(km_ref_job)

Exercise 3 - Fit a cox proportional hazards model to your data.

# Estimate a cox proportional hazards model. Include all of the main effects and the interactions between INTPER_RATE and PROG_EXER_RATE as well as HIRE_REC and REFERRAL. 

# HINT: An interaction variable is created by multiplying Variable 1 with Variable 2 (Variable 1 * Variable 2).

mod_1 <- survival::coxph(
  surv_object ~ NUMBER_JOBS + PROG_EXER_RATE + 
    INTPER_RATE + PRIOR_EXP + HIRE_REC*REFERRAL + INTPER_RATE*PROG_EXER_RATE,
  data = surv_data
)

# View the coefficient estimates and standard errors of the model
summary(mod_1)

## Call:
## survival::coxph(formula = surv_object ~ NUMBER_JOBS + PROG_EXER_RATE + 
##     INTPER_RATE + PRIOR_EXP + HIRE_REC * REFERRAL + INTPER_RATE * 
##     PROG_EXER_RATE, data = surv_data)
## 
##   n= 8000, number of events= 5090 
## 
##                                     coef exp(coef) se(coef)      z Pr(>|z|)    
## NUMBER_JOBS2                     0.01200   1.01207  0.03188  0.376   0.7067    
## NUMBER_JOBS3+                    0.40501   1.49932  0.04401  9.204  < 2e-16 ***
## PROG_EXER_RATEPass              -0.29343   0.74570  0.14419 -2.035   0.0418 *  
## PROG_EXER_RATEPass+             -0.47220   0.62363  0.24214 -1.950   0.0512 .  
## INTPER_RATE                     -0.10119   0.90376  0.01485 -6.816 9.38e-12 ***
## PRIOR_EXPYes                    -0.01543   0.98469  0.03225 -0.478   0.6323    
## HIRE_REC                        -0.08397   0.91946  0.01986 -4.227 2.36e-05 ***
## REFERRALYes                     -0.90587   0.40419  0.19274 -4.700 2.60e-06 ***
## HIRE_REC:REFERRALYes            -0.02592   0.97441  0.08550 -0.303   0.7618    
## PROG_EXER_RATEPass:INTPER_RATE   0.00691   1.00693  0.01700  0.406   0.6844    
## PROG_EXER_RATEPass+:INTPER_RATE  0.01875   1.01893  0.02841  0.660   0.5093    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                 exp(coef) exp(-coef) lower .95 upper .95
## NUMBER_JOBS2                       1.0121     0.9881    0.9508    1.0773
## NUMBER_JOBS3+                      1.4993     0.6670    1.3754    1.6344
## PROG_EXER_RATEPass                 0.7457     1.3410    0.5621    0.9892
## PROG_EXER_RATEPass+                0.6236     1.6035    0.3880    1.0024
## INTPER_RATE                        0.9038     1.1065    0.8778    0.9304
## PRIOR_EXPYes                       0.9847     1.0156    0.9244    1.0489
## HIRE_REC                           0.9195     1.0876    0.8843    0.9560
## REFERRALYes                        0.4042     2.4741    0.2770    0.5897
## HIRE_REC:REFERRALYes               0.9744     1.0263    0.8241    1.1522
## PROG_EXER_RATEPass:INTPER_RATE     1.0069     0.9931    0.9739    1.0411
## PROG_EXER_RATEPass+:INTPER_RATE    1.0189     0.9814    0.9637    1.0773
## 
## Concordance= 0.59  (se = 0.004 )
## Likelihood ratio test= 621.4  on 11 df,   p=<2e-16
## Wald test            = 573.6  on 11 df,   p=<2e-16
## Score (logrank) test = 596.4  on 11 df,   p=<2e-16

# Estimate a cox proportional hazards model that only includes the main effects (no interactions).
mod_2 <- survival::coxph(
  surv_object ~ NUMBER_JOBS + PROG_EXER_RATE + 
    INTPER_RATE + PRIOR_EXP + HIRE_REC + REFERRAL,
  data = surv_data
)

# View the coefficient estimates and standard errors for the main effects model. 
summary(mod_2)

## Call:
## survival::coxph(formula = surv_object ~ NUMBER_JOBS + PROG_EXER_RATE + 
##     INTPER_RATE + PRIOR_EXP + HIRE_REC + REFERRAL, data = surv_data)
## 
##   n= 8000, number of events= 5090 
## 
##                          coef exp(coef)  se(coef)       z Pr(>|z|)    
## NUMBER_JOBS2         0.011914  1.011985  0.031884   0.374    0.709    
## NUMBER_JOBS3+        0.404358  1.498340  0.043982   9.194  < 2e-16 ***
## PROG_EXER_RATEPass  -0.236436  0.789437  0.034121  -6.929 4.23e-12 ***
## PROG_EXER_RATEPass+ -0.316998  0.728333  0.055603  -5.701 1.19e-08 ***
## INTPER_RATE         -0.094790  0.909564  0.006966 -13.607  < 2e-16 ***
## PRIOR_EXPYes        -0.015768  0.984356  0.032248  -0.489    0.625    
## HIRE_REC            -0.085417  0.918129  0.019324  -4.420 9.86e-06 ***
## REFERRALYes         -0.962249  0.382033  0.062635 -15.363  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## NUMBER_JOBS2           1.0120     0.9882    0.9507    1.0772
## NUMBER_JOBS3+          1.4983     0.6674    1.3746    1.6332
## PROG_EXER_RATEPass     0.7894     1.2667    0.7384    0.8440
## PROG_EXER_RATEPass+    0.7283     1.3730    0.6531    0.8122
## INTPER_RATE            0.9096     1.0994    0.8972    0.9221
## PRIOR_EXPYes           0.9844     1.0159    0.9241    1.0486
## HIRE_REC               0.9181     1.0892    0.8840    0.9536
## REFERRALYes            0.3820     2.6176    0.3379    0.4319
## 
## Concordance= 0.59  (se = 0.004 )
## Likelihood ratio test= 620.9  on 8 df,   p=<2e-16
## Wald test            = 572.2  on 8 df,   p=<2e-16
## Score (logrank) test = 590.3  on 8 df,   p=<2e-16

# Compare the model you fit above to a model that only includes the main effects (no interactions). Use the anova() function to determine if the model with interactions fits your data significantly better than the model with main effects only. If you are unsure of how to use the anova() function, then view the help documentation on it ?anova.coxph or exercise solutions.  
anova(mod_1, mod_2)

## Analysis of Deviance Table
##  Cox model: response is  surv_object
##  Model 1: ~ NUMBER_JOBS + PROG_EXER_RATE + INTPER_RATE + PRIOR_EXP + HIRE_REC * REFERRAL + INTPER_RATE * PROG_EXER_RATE
##  Model 2: ~ NUMBER_JOBS + PROG_EXER_RATE + INTPER_RATE + PRIOR_EXP + HIRE_REC + REFERRAL
##   loglik Chisq Df P(>|Chi|)
## 1 -43245                   
## 2 -43245 0.551  3    0.9076

# Based on the model comparison above, decide if you should choose the more complicated interaction model or the less complicated main effects model. Using the model you decided on, test the proportional hazard assumption and determine if any variables violate it. If they do decide if you should drop them from the model.
survival::cox.zph(mod_2)

##                chisq df     p
## NUMBER_JOBS    2.461  2 0.292
## PROG_EXER_RATE 1.294  2 0.523
## INTPER_RATE    0.129  1 0.720
## PRIOR_EXP      3.790  1 0.052
## HIRE_REC       1.542  1 0.214
## REFERRAL       0.174  1 0.677
## GLOBAL         9.414  8 0.309

mod <- survival::coxph(
  surv_object ~ REFERRAL + NUMBER_JOBS + PROG_EXER_RATE + 
    INTPER_RATE + HIRE_REC,
  data = surv_data
)

Prior experience is close to violating the assumption and is unrelated to attrition, so we can safely drop it from the model ## Exercise 4 - Interpreting the proportional hazards results.

# Using the final model you decided on during last exercise, provide an interpretation of each of the predictors you kept in the model. 
summary(mod)

## Call:
## survival::coxph(formula = surv_object ~ REFERRAL + NUMBER_JOBS + 
##     PROG_EXER_RATE + INTPER_RATE + HIRE_REC, data = surv_data)
## 
##   n= 8000, number of events= 5090 
## 
##                          coef exp(coef)  se(coef)       z Pr(>|z|)    
## REFERRALYes         -0.962134  0.382077  0.062635 -15.361  < 2e-16 ***
## NUMBER_JOBS2         0.012290  1.012366  0.031874   0.386      0.7    
## NUMBER_JOBS3+        0.404310  1.498268  0.043982   9.193  < 2e-16 ***
## PROG_EXER_RATEPass  -0.236269  0.789568  0.034119  -6.925 4.37e-12 ***
## PROG_EXER_RATEPass+ -0.316808  0.728470  0.055602  -5.698 1.21e-08 ***
## INTPER_RATE         -0.094830  0.909528  0.006966 -13.613  < 2e-16 ***
## HIRE_REC            -0.085499  0.918054  0.019324  -4.425 9.66e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## REFERRALYes            0.3821     2.6173    0.3379    0.4320
## NUMBER_JOBS2           1.0124     0.9878    0.9511    1.0776
## NUMBER_JOBS3+          1.4983     0.6674    1.3745    1.6332
## PROG_EXER_RATEPass     0.7896     1.2665    0.7385    0.8442
## PROG_EXER_RATEPass+    0.7285     1.3727    0.6533    0.8123
## INTPER_RATE            0.9095     1.0995    0.8972    0.9220
## HIRE_REC               0.9181     1.0893    0.8839    0.9535
## 
## Concordance= 0.59  (se = 0.004 )
## Likelihood ratio test= 620.6  on 7 df,   p=<2e-16
## Wald test            = 571.8  on 7 df,   p=<2e-16
## Score (logrank) test = 590  on 7 df,   p=<2e-16

Referral reduces hazard by a factor of .38 (~62%). No difference between 0-1 and 2 jobs prior to hiring. Having 3+ jobs in the year before hire increases hazard by a factor of 1.50 (50%). Compared to a “Concerns” rating on the programming exercise, Pass and Pass+ both reduce hazard by factors of .79 (21%) and .73 (27%), respectively. Every unit increase in interpersonal skills reduces the hazard by a factor of .91 (9%). Every additional increase in hiring rec reduces hazard by a factor of .92 (8%).

# Use the function confint() to calculate the 95% confidence intervals for each coefficient.
# Transform the confidence intervals so that they can be interpreted as confidence intervals for exp(coef).
confint(mod)

##                          2.5 %      97.5 %
## REFERRALYes         -1.0848963 -0.83937218
## NUMBER_JOBS2        -0.0501824  0.07476316
## NUMBER_JOBS3+        0.3181076  0.49051261
## PROG_EXER_RATEPass  -0.3031419 -0.16939606
## PROG_EXER_RATEPass+ -0.4257857 -0.20783126
## INTPER_RATE         -0.1084834 -0.08117663
## HIRE_REC            -0.1233720 -0.04762507

exp(confint(mod))

##                         2.5 %    97.5 %
## REFERRALYes         0.3379368 0.4319816
## NUMBER_JOBS2        0.9510559 1.0776289
## NUMBER_JOBS3+       1.3745241 1.6331532
## PROG_EXER_RATEPass  0.7384943 0.8441745
## PROG_EXER_RATEPass+ 0.6532563 0.8123441
## INTPER_RATE         0.8971938 0.9220308
## HIRE_REC            0.8839348 0.9534912

Exercise 5 - Creating predicted survival curves from your proportional hazards model.

# Create a new data frame that contains a column for each of the variables you included in your Cox regression model. Pick of variable you are interested in and provide different values for that variable (the values need to occur in the original data frame) while holding the other variables constant (e.g. NUMBER_JOBS == "0-1" for all rows in the new data frame).
new_data <- data.frame(REFERRAL = c("No", "Yes"), 
                       NUMBER_JOBS = c("3+", "3+"), 
                       PROG_EXER_RATE = c("Concerns", "Concerns"), 
                       INTPER_RATE = c(1, 1), 
                       PRIOR_EXP = c("No", "No"), 
                       HIRE_REC = c(0, 0))

# With your new data frame and your estimated Cox regression model, use the function: survival::survfit() to create predicted survival probabilities from your model and new data frame. 
# Using summary(), explore these new probabilities. 
predicted_surv_prob <- survival::survfit(mod, newdata = new_data)
summary(predicted_surv_prob)

## Call: survfit(formula = mod, newdata = new_data)
## 
##  time n.risk n.event survival1 survival2
##     1   8000     182   0.90443     0.962
##     2   7818     181   0.81626     0.925
##     3   7637     187   0.73205     0.888
##     4   7450     135   0.67541     0.861
##     5   7315     128   0.62480     0.836
##     6   7187     110   0.58362     0.814
##     7   7077      87   0.55252     0.797
##     8   6990      68   0.52909     0.784
##     9   6922      41   0.51533     0.776
##    10   6881      58   0.49631     0.765
##    11   6823      36   0.48477     0.758
##    12   6787      35   0.47374     0.752
##    13   6752      27   0.46536     0.747
##    14   6725      29   0.45648     0.741
##    15   6696      38   0.44504     0.734
##    16   6658      41   0.43293     0.726
##    17   6617      36   0.42250     0.720
##    18   6581      46   0.40945     0.711
##    19   6535      57   0.39370     0.700
##    20   6478      69   0.37524     0.688
##    21   6409      93   0.35140     0.671
##    22   6316     117   0.32306     0.649
##    23   6199     134   0.29276     0.625
##    24   6065     124   0.26667     0.604
##    25   5941     163   0.23509     0.575
##    26   5778     218   0.19733     0.538
##    27   5560     197   0.16728     0.505
##    28   5363     199   0.14057     0.473
##    29   5164     178   0.11955     0.444
##    30   4986     160   0.10278     0.419
##    31   4826     117   0.09171     0.401
##    32   4709      94   0.08349     0.387
##    33   4615      76   0.07728     0.376
##    34   4539      51   0.07330     0.368
##    35   4488      32   0.07089     0.364
##    36   4456      15   0.06978     0.362
##    37   4441      12   0.06890     0.360
##    38   4429      16   0.06774     0.358
##    39   4413      15   0.06667     0.355
##    40   4398      26   0.06484     0.352
##    41   4372      47   0.06163     0.345
##    42   4325      55   0.05804     0.337
##    43   4270      59   0.05436     0.329
##    44   4211      59   0.05087     0.320
##    45   4152      85   0.04613     0.309
##    46   4067      64   0.04280     0.300
##    47   4003      83   0.03875     0.289
##    48   3920     100   0.03428     0.276
##    49   3820     122   0.02936     0.260
##    50   3698      97   0.02586     0.247
##    51   3601      90   0.02290     0.236
##    52   3511      91   0.02017     0.225
##    53   3420      88   0.01778     0.214
##    54   3332      88   0.01561     0.204
##    55   3244      68   0.01408     0.196
##    56   3152      57   0.01288     0.190
##    57   3088      41   0.01207     0.185
##    58   3040      39   0.01133     0.181
##    59   2951      39   0.01061     0.176
##    60   2868      18   0.01029     0.174
##    61   2813      11   0.01009     0.173
##    62   2726       2   0.01006     0.172
##    63   2645       3   0.01000     0.172
##    64   2496       3   0.00994     0.172
##    65   2353       1   0.00992     0.172
##    67   1502       2   0.00986     0.171
##    68   1485       9   0.00957     0.169
##    69   1461       9   0.00928     0.167
##    70   1445      10   0.00897     0.165
##    71   1403      13   0.00856     0.162
##    72   1348       9   0.00829     0.160

# With your new data frame and your estimated Cox regression model, use the function: survminer::ggadjustedcurves() to plot your predicted survival functions. If you are familiar with ggplot2, then try to customize the plot output. 
survminer::ggadjustedcurves(mod, 
                            data = new_data, variable = "REFERRAL", 
                            ylab = "Probability of Staying",
                            xlab = "Months Since Hire",
                            ggtheme = ggplot2::theme_minimal()) +
  ggplot2::labs(color = "Referral")

Using the results of your final Cox regression model and your predicted survival functions to write a high-level business summary of what predictor or predictors you believe to be the most important to focus on to reduce attrition.