statuser

Miscellaneous R functions for papers, blogposts, & teaching by Uri Simonsohn.

Installation

# Install from GitHub with `groundhog` (for version control)
groundhog::groundhog.library("urisohn/statuser", date)   #date used for version control 

# Or install from GitHub with `devtools`
devtools::install_github("urisohn/statuser")

Overview

Functions I often use and are not (sufficiently?) available in existing packages.

Functions

📄 Uri’s Papers

twolines(): Two-Lines Test of U-Shapes

Implements the two-lines test for U-shaped (or inverted U-shaped) relationships introduced by Simonsohn (2018).

Reference: Simonsohn, Uri (2018) “Two lines: A valid alternative to the invalid testing of U-shaped relationships with quadratic regressions.” AMPPS, 538-555. https://doi.org/10.1177/2515245918805755

# Simple example with simulated data
set.seed(123)
x <- rnorm(100)
y <- -x^2 + rnorm(100)
data <- data.frame(x = x, y = y)
result <- twolines(y ~ x, data = data)

# With covariates
z <- rnorm(100)
y <- -x^2 + 0.5*z + rnorm(100)
data <- data.frame(x = x, y = y, z = z)
result <- twolines(y ~ x + z, data = data)

# Without data argument (variables evaluated from environment)
x <- rnorm(100)
y <- -x^2 + rnorm(100)
result <- twolines(y ~ x)

# Suppress Robin Hood details
result <- twolines(y ~ x, data = data, quiet = TRUE)

# Save plot to PNG
result <- twolines(y ~ x, data = data, pngfile = "twolines_plot.png")

📊 Graphing

plot_cdf(): CDF for multiple groups in one plot

y <- rnorm(100)
x <- rep(c("A", "B"), 50)
plot_cdf(y ~ x)  # Uses default colors
plot_cdf(y ~ x, col = c("red", "blue"))  # Custom colors

# Control group order
plot_cdf(y ~ x, order = -1)  # Reverse default order
plot_cdf(y ~ x, order = c("B", "A"))  # Custom order

plot_density(): Density for multiple groups in one plot

y <- rnorm(100)
x <- rep(c("A", "B"), 50)
plot_density(y ~ x)  # Uses default colors
plot_density(y ~ x, col = c("red", "blue"))  # Custom colors
plot_density(y ~ x, show.means = FALSE)  # Hide mean segments

# Control group order
plot_density(y ~ x, order = -1)  # Reverse default order
plot_density(y ~ x, order = c("B", "A"))  # Custom order

plot_freq(): Frequency distribution without binning, with value labels

x <- c(1, 1, 2, 2, 2, 5, 5)
plot_freq(x)

# Grouped frequency plot
df <- data.frame(value = c(1, 1, 2, 2, 2, 5, 5), group = c("A", "A", "A", "B", "B", "A", "B"))
plot_freq(value ~ group, data = df)

# Control group order in legend and plot
plot_freq(value ~ group, data = df, order = c("B", "A"))  # B first, then A
plot_freq(value ~ group, data = df, order = -1)  # Reverse default order

# Factor levels are respected automatically
df$group <- factor(df$group, levels = c("B", "A"))
plot_freq(value ~ group, data = df)  # B will appear first

# Show percentages instead of frequencies
plot_freq(value ~ group, data = df, freq = FALSE)

# Customize legend and labels
plot_freq(value ~ group, data = df, legend.title = "Treatment", col.text = "black")

plot_gam(): Plot GAM model predictions with optional distribution plot

library(mgcv)
# Fit a GAM model
data(mtcars)
model <- gam(mpg ~ s(hp) + s(wt) + factor(cyl), data = mtcars)

# Basic plot
plot_gam(model, "hp")

# With distribution plot below (auto-selects plot_freq or plot_density)
plot_gam(model, "hp", plot2 = "auto")

# Always use frequency plot
plot_gam(model, "hp", plot2 = "freq")

# Always use density plot
plot_gam(model, "hp", plot2 = "density")

# Customize colors for main plot and bottom plot
plot_gam(model, "hp", plot2 = "auto", 
         col = "blue4", bg = adjustcolor('dodgerblue', .2),  # Main plot colors
         col2 = "steelblue", bg2 = "gray95")  # Bottom plot colors

# Hold other variables at different quantile
plot_gam(model, "hp", quantile.others = 25)

scatter.gam(): Scatter plots with GAM smooth lines

x <- rnorm(100)
y <- 2*x + rnorm(100)
scatter.gam(x, y)

text2(): Adds to text(): align=‘center’ , bg=‘yellow’

plot(1:10, 1:10, type = "n")
   text2(2, 8, "Left", align = "left", bg = "lightblue")
   text2(5, 8, "Center", align = "center", bg = "lightgreen")
   text2(8, 8, "Right",    align = "right", bg = "lightyellow")
   text2(5, 5, "Red Text", col = "red", bg = "white")

resize_images(): Saves any image (or all in folder) as PNG with set width.

# Resize a single image file
   resize_images("path/to/image.svg", width = 800)

# Resize all images in a folder to 800px width
   resize_images("path/to/images", width = 800)

# Resize images to different widths
   resize_images("path/to/images", width = c(800, 1200, 600))

📈 Statistical Analyses

table2(): Enhances base table(): (1) variable names are shown, (2) proportions are an option

df <- data.frame(
  group = c("A", "A", "B", "B", "A"),
  status = c("X", "Y", "X", "Y", "X")
)

# table() does not show var names, table2() does
table (df$group, df$status)
table2(df$group, df$status)

# can report proportinos (building in prop.table() )
table2(df$group, df$status, prop = "all")    # Overall proportions
table2(df$group, df$status, prop = "row")    # Row proportions
table2(df$group, df$status, prop = "column") # Column proportions

desc_var(): Comprehensive variable summary stats, optionally by grouping variable(s)

# Why use desc_var() instead of psych::describeBy()?
# - Returns a single dataframe (not a list) - easier to export, filter, merge
# - Shows mode statistics (most frequent values) - useful for discrete data
# - Counts missing values automatically
# - Columns are labeled for easy interpretation
# - Supports multiple grouping variables with formula syntax
# - Results are sorted by grouping variables

# With grouping - compare groups side-by-side
df <- data.frame(score = rnorm(100), condition = rep(c("Control", "Treatment"), 50))
desc_var(score, condition, data = df)

# Formula syntax (single grouping variable)
desc_var(score ~ condition, data = df)

# Multiple grouping variables - results sorted by all grouping variables
df2 <- data.frame(
  score = rnorm(200),
  x1 = rep(c("A", "B"), 100),
  x2 = rep(c("men", "women"), each = 100),
  x3 = sample(1:3, replace = TRUE, size = 200)
)
desc_var(score ~ x1 + x2 + x3, data = df2)

# Without grouping - get stats for full dataset
desc_var(score, data = df)

# Direct vectors (no data frame needed)
scores <- rnorm(100)
groups <- rep(c("A", "B"), 50)
desc_var(scores, groups)

# Custom decimal places for cleaner output
desc_var(score, condition, data = df, decimals = 2)

lm2(): Enhanced linear regression with robust SEs, standardized coefficients, and diagnostics

# Basic usage - uses HC3 robust standard errors by default
lm2(mpg ~ wt + hp, data = mtcars)

# Works without data argument if variables exist in environment
y <- rnorm(100)
x1 <- rnorm(100)
x2 <- rnorm(100)
lm2(y ~ x1 + x2)

# Output includes:
# - Robust and classical standard errors side-by-side
# - Standardized coefficients (effect.size)
# - Missing value counts per variable
# - Red flags when robust and classical SEs differ substantially

# Get estimatr's native output instead
lm2(mpg ~ wt + hp, data = mtcars, output = "estimatr")

# Use different robust SE type (HC0, HC1, HC2, HC3)
lm2(mpg ~ wt + hp, data = mtcars, se_type = "HC2")

t.test2(): Enhances base t.test: (1) console shows mean diff & var names, (2) output is dataframe, not list

# Data for example
    men <- rnorm(100, mean = 5, sd = 1)
    women <- rnorm(100, mean = 4.8, sd = 1)

# t.test() is harder to interpret, does not show difference of means(!) or indicate which mean is subtraced from which

    t.test(men, women) 
    t.test2(men, women)

# Formula syntax
    data <- data.frame(y = rnorm(100), group = rep(c("A", "B"), 50))
    t2 = t.test2(y ~ group, data = data)  # Columns: A, B, A-B, SE_A-B, conf.intL, conf.intH, level, t, df, p.value, method, se_A, se_B
    t2

# Formula syntax without data argument
    x1=rnorm(100)
    x2=rnorm(100)
    y <- c(x1, x2)
    condition <- rep(c('A', 'B'), c(length(x1), length(x2)))
    t.test2(y ~ condition)

✨ Formatting

format_pvalue(): Format p-values for clean display in figures and tables (e.g., p<.0001)

format_pvalue(0.05)
format_pvalue(0.0001, include_p = TRUE)

message2(): Print colored messages to console

message2("This is a red message", col = "red", font = 2)
message2("This is a cyan message", col = "cyan")

🗂️ Miscellaneous

list2(): Create lists with objects without having to name them

    x <- 1:5
    y <- letters[1:3]
    z <- matrix(1:4, nrow = 2)
    list2(x, y, z)

convert_to_sql(): Convert CSV to SQL

    convert_to_sql("data.csv", "data.sql")

clear(): Clear plot, global environment, and console

Clears plot, global environment, and console. On first use you are prompted to authorize (one-time; your choice is saved).

# Create some objects
    x <- 1:10
    y <- rnorm(10)
    plot(x, y)

# Clear everything (first run may prompt for one-time permission)
    clear()

Dependencies

mgcv (for scatter.gam(), plot_gam(), and twolines())
rsvg (for resize_images())
magick (for resize_images())
sandwich (for twolines())
lmtest (for twolines())
estimatr (for lm2())

Author

Uri Simonsohn
Email: urisohn@gmail.com

License

GPL-3

Version

Release: 0.1.8
Development: 0.1.8.9001