R

Personal R tricks and code snippets.

R vs. Python

R Python
count() value_counts()
filter .loc[]
skim .describe()

Browser for shiny apps

echo "R_BROWSER=zen-browser" >> ~/.Renviron

Excel

library(readxl) df <- read_excel("supplement-v2-for_BNX.xlsx", sheet = "Case List", skip = 5, n_max = 26)

Interpreter

source("script.R")
savehistory("script.R")

Linter

library(lintr)
lint(filename="analyze.R")

Data Cleaning

Readable Names

colnames(df) <- make.names(str_trim(colnames(df)))

Combine columns string str_glue

df <- df %>%
  mutate(
    new_column = str_glue("{column1} ({column2}–{column3}) [{column4}±{column5}]")
  )

Rmarkdown

render

Plots

=== Order category values ==

library(forcats)
df %>%
  ggplot( aes( x = fct_rev(fct_reorder(angiolog, groin.to.open, median)), y = groin.to.open)) +
  geom_boxplot() +
  ggtitle("Groin to open podle angiologa") +
  ylab("groin to open [min]") +
  xlab("angiolog")

Add p-values

library(ggpubr)

ggboxplot( df, x = "group", y = "age", color = "group", add = "jitter") +
    stat_compare_means()

To change the test: stat_compare_means( method = 't.test' ). Method can be t.test, wilcox.test, anova, kruskal.test.

To get pairwise comparisons:

comparisons <- list( c("group1", "group2"), c("group2", "group3"), c("group1", "group3" ))
ggboxplot( df, x = "group", y = "age", color = "group", add = "jitter" ) +
    stat_compare_means( comparisons = comparisons, label.y = c(58, 60, 62) ) +
    stat_compare_means( method = 'kruskal.test', label.y = 66 )

First stat_compare_means does pairwise comparison, second stat_compare_means does global test.

Add counts to bar plot

df %>% 
    ggplot( aes( x = as.factor(rok) ) ) + 
    geom_bar() +
    stat_count(geom = "text", size = 4, aes(label = ..count..), position=position_stack(vjust=1.15)) +
    facet_grid(sluzba~.) +
    ggtitle("Počet rekanalizací (IVT+MT) v jednotlivých letech dle pracovní doby") +
    ylab("počet") +
    xlab("rok")

stringr error

Error in dyn.load(file, DLLpath = DLLpath, ...) : unable to load shared object '/home/pavel/R/x86_64-pc-linux-gnu-library/4.4/stringi/libs/stringi.so'

to solve: install.packages("stringi")

timeout error

getOption('timeout')
options(timeout=120)

xml error

unable to load shared object '/home/pavel/R/x86_64-pc-linux-gnu-library/4.5/xml2/libs/xml2.so'

sudo pacman -S libxml2-legacy

substring

df <- df %>%
    mutate(
        tier = ifelse(
            `STIM TYPE` == "SHAM",
            "SHAM",
            substr( `Study ID`, start = 1, stop = 1)
        )
    ) %>%
    drop_na(`Study ID`)

Description

Descriptive statistics by group:

library(skimr)
df %>% group_by(group) %>% skim()
 df %>% 
    summarize(
              "DTG_mean" = mean(door.to.groin, na.rm = TRUE),
              "DTG_sd" = sd(door.to.groin, na.rm = TRUE),
              "DTG_median" = median(door.to.groin, na.rm = TRUE),
              "DTG_Q1" = quantile(door.to.groin, p = 0.25, na.rm = TRUE),
              "DTG_Q3" = quantile(door.to.groin, p = 0.75, na.rm = TRUE),

              "GTO_mean" = mean(groin.to.open, na.rm = TRUE),
              "GTO_sd" = sd(groin.to.open, na.rm = TRUE),
              "GTO_median" = median(groin.to.open, na.rm = TRUE),
              "GTO_Q1" = quantile(groin.to.open, p = 0.25, na.rm = TRUE),
              "GTO_Q3" = quantile(groin.to.open, p = 0.75, na.rm = TRUE),

              "DTN_mean" = mean(door.to.needle, na.rm = TRUE),
              "DTN_sd" = sd(door.to.needle, na.rm = TRUE),
              "DTN_median" = median(door.to.needle, na.rm = TRUE),
              "DTN_Q1" = quantile(door.to.needle, p = 0.25, na.rm = TRUE),
              "DTN_Q3" = quantile(door.to.needle, p = 0.75, na.rm = TRUE),
    ) %>%
    mutate(
        door_to_groin = str_glue("{DTG_median} ({DTG_Q1}–{DTG_Q3}) [{DTG_mean}±{DTG_sd}]"),
        groin_to_open = str_glue("{GTO_median} ({GTO_Q1}–{GTO_Q3}) [{GTO_mean}±{GTO_sd}]"),
        door_to_needle = str_glue("{DTN_median} ({DTN_Q1}–{DTN_Q3}) [{DTN_mean}±{DTN_sd}]"),
    ) %>% 
    select(door_to_groin, groin_to_open, door_to_needle)

Kruskal-Wallis & Dunn test

{{R kruskal.test(dNIHSS90 ~ tier, data = df) %>% print()

#! one-sided p-values!!! library(dunn.test) dunn.test(df\(dNIHSS90, df\)tier) dunn.test(df\(dNIHSS90, df\)tier, kw=TRUE, method="bonferroni")

#two sided p-values!!! library(FSA) dunnTest( dNIHSS90 ~ tier, df) dunnTest( dNIHSS90 ~ tier, df, method="bonferroni") }}

Fisher test

test <- df %>% group_by(group) %>% select( group, sex ) %>% table() %>% fisher.test()

library(broom)
tidy(test)$p.value

ANOVA

test <- aov(age ~ group, df)

library(broom)
p <- tidy(test)$p.value

Kruskall-Wallis

`test <- kruskal.test( age ~ group, df )

library(broom) p <- tidy(test)$p.value`

Multiple comparisons

fisher_p_values <- tibble(
    test = c( "humoral_month3", "humoral_month12", "cellular_month3", "cellular_month12"),
    p_value = c(
        response %>%
        filter( month == 3 ) %>%
        select( group, humoral_response ) %>%
        table() %>%
        fisher.test() %>%
        tidy() %>%
        select(p.value) %>%
        pull(),

        response %>%
        filter( month == 12 ) %>%
        select( group, humoral_response ) %>%
        table() %>%
        fisher.test() %>%
        tidy() %>%
        select(p.value)
        %>% pull(),

        response %>%
        filter( month == 3 ) %>%
        select( group, cellular_response ) %>%
        table() %>%
        fisher.test() %>%
        tidy() %>%
        select(p.value) %>%
        pull(),

        response %>%
        filter( month == 12 ) %>%
        select( group, cellular_response ) %>%
        table() %>%
        fisher.test() %>%
        tidy() %>%
        select(p.value) %>%
        pull()
    )) %>% mutate(
        Bonferroni = p.adjust( p_value, method = "bonferroni"),
        Benjamini_Hochberg = p.adjust( p_value, method = "BH" ),
    )

Libraries

library(tidyverse)
library(readxl)
library(lubridate)
library(ggplot2)
library(lme4)
library(brms)
library(car)
library(skimr)
library(broom)
library(ggpubr)

print(version)
packageVersion("tidyverse")
packageVersion("readxl")
packageVersion("lubridate")
packageVersion("ggplot2")
packageVersion("lme4")
packageVersion("brms")
packageVersion("car")
packageVersion("skimr")
packageVersion("broom")
packageVersion("ggpubr")

Wide to long format

df_long <- df %>%
    pivot_longer(
	cols = starts_with( "humoral_" ),
	names_to = "month",
	names_prefix = "humoral_M",
	values_to = "humoral_response"
    ) %>%
    mutate( month = as.numeric(month) )

Joins

df_joined <- df1 %>%
    inner_join(
        df2,
        by = join_by( subject_id == subject_id, month == month, group == group, sex == sex, age == age )
    ) %>% select(
           subject_id,
           group,
           sex,
           age,
           month,
           cellular_response,
           humoral_response,
   )

Mixed-effects model

Linear regression

library(lme4)
model <- lmer( outcome ~ group + month + age + sex + (1 | subject_id), data = df )
Anova(model)

Logistic regression

Subject_id as a random effect:

library(lme4)
model <- glmer( binary_outcome ~ group + month + age + sex + (1 | subject_id), data = df, family = binomial )
Anova(model)

Bayesian regression model

Linear regression

library(brms)
model <- brm( outcome ~ group + sex + age + month + (1|subject_id), data = response )
plot(model)
posterior_interval(model)

Logistic regression

library(brms)
model <- brm( binary_outcome ~ group + sex + age + month + (1|subject_id), family = bernoulli, data = response )
plot(model)
posterior_interval(model)