R#

This guide covers using R for statistical computing and data analysis on NMTHPC.

Loading R#

$ module avail r
$ module load r/4.3.0

Verify installation:

$ R --version
$ which R

Running R#

Interactive R Session#

On a compute node (recommended):

$ srun --mem=16G --time=02:00:00 --pty bash
$ module load r/4.3.0
$ R

In R session:

> print("Hello from NMTHPC")
> q()  # Quit R

R Scripts#

Create script (analysis.R):

# analysis.R: summarize `value` by `category` and draw a boxplot.
# Expects data.csv in the working directory with `category` and `value`
# columns; writes plot.png and results.csv.

# Load libraries
library(ggplot2)
library(dplyr)

# Read data (named raw_data to avoid shadowing the built-in data())
raw_data <- read.csv("data.csv")

# Analysis: one row per category with its mean value
summary_stats <- raw_data %>%
  group_by(category) %>%
  summarize(mean_value = mean(value))

# Plot (named box_plot to avoid shadowing the built-in plot())
box_plot <- ggplot(raw_data, aes(x = category, y = value)) +
  geom_boxplot()

# Save
ggsave("plot.png", box_plot, width = 8, height = 6)
# row.names = FALSE keeps the 1..n row-index column out of the CSV
write.csv(summary_stats, "results.csv", row.names = FALSE)

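Run the script non-interactively with either command (R CMD BATCH writes the output to analysis.Rout; Rscript prints it to the terminal):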

$ R CMD BATCH analysis.R
$ Rscript analysis.R

Batch R Jobs#

SLURM script:

#!/bin/bash
# Batch job that loads the r/4.3.0 module and runs analysis.R with Rscript.
# Job name
#SBATCH --job-name=r_analysis
# Output file; %j in the name is replaced by the job ID
#SBATCH --output=r_job_%j.out
# One task
#SBATCH --ntasks=1
# CPU cores for that task
# NOTE(review): analysis.R as shown is serial -- confirm 4 cores are needed
#SBATCH --cpus-per-task=4
# Memory request
#SBATCH --mem=32G
# Wall-clock limit (HH:MM:SS)
#SBATCH --time=04:00:00

# Make the R 4.3.0 installation available in this job's environment
module load r/4.3.0

# Run the analysis non-interactively
Rscript analysis.R

Installing R Packages#

Installing to User Library#

In R:

# Install packages into the user library.
# The CRAN mirror is passed on every call: without one configured,
# install.packages() prompts for a mirror interactively and fails in
# non-interactive runs (Rscript, batch jobs) unless a site profile sets it.
cran_repo <- "https://cloud.r-project.org"
install.packages("ggplot2", repos = cran_repo)
install.packages(c("dplyr", "tidyr", "readr"), repos = cran_repo)

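By default, packages install to your user library, which has one subdirectory per R minor version: ~/R/x86_64-pc-linux-gnu-library/4.3/ for R 4.3.x. Run .libPaths() in R to confirm the location.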

Installing from GitHub#

# Install devtools first, but only if it is not already available.
# The explicit mirror lets this run without a CRAN mirror prompt.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools", repos = "https://cloud.r-project.org")
}

# Install from GitHub ("username/package" is a placeholder for owner/repo)
devtools::install_github("username/package")

Bioconductor Packages#

# Install BiocManager only when it is missing. requireNamespace() checks
# for the package without attaching it to the search path, unlike require().
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cloud.r-project.org")
}

# Install a Bioconductor package through BiocManager
BiocManager::install("DESeq2")

Parallel R#

Using Multiple Cores#

parallel package:

library(parallel)

# Worker count comes from the Slurm allocation; falls back to 1 when
# SLURM_CPUS_PER_TASK is not set
num_cores <- as.integer(Sys.getenv("SLURM_CPUS_PER_TASK", unset = "1"))

# Work done for a single input; replace the body with your computation
square <- function(x) {
  x^2
}

# Parallel apply: run the function over every input using num_cores cores
results <- mclapply(seq_len(100), square, mc.cores = num_cores)

SLURM script:

#!/bin/bash
# Batch job for the multi-core R example.
# CPU cores per task; the R code reads this count from the
# SLURM_CPUS_PER_TASK environment variable
#SBATCH --cpus-per-task=16
# Memory request
# NOTE(review): no --time or --job-name is set here -- confirm the
# cluster defaults are acceptable
#SBATCH --mem=64G

# Load R, then run the parallel script non-interactively
module load r/4.3.0
Rscript parallel_script.R

foreach and doParallel#

library(foreach)
library(doParallel)

# Size the parallel backend from the Slurm allocation (1 if the variable
# is not set)
num_cores <- as.integer(Sys.getenv("SLURM_CPUS_PER_TASK", unset = "1"))
registerDoParallel(cores = num_cores)

# Each iteration yields one value; .combine = "c" concatenates them
results <- foreach(i = seq_len(100), .combine = "c") %dopar% {
  # Your computation
  i^2
}

# Shut down any implicit cluster created by registerDoParallel()
stopImplicitCluster()

Common R Workflows#

Data Analysis#

# Per-category mean and standard deviation of `value`, drawn as a bar
# chart with +/- 1 SD error bars and saved to results.png.
# Expects dataset.csv with `category` and `value` columns.
library(dplyr)
library(ggplot2)

# Load data
raw_data <- read.csv("dataset.csv")

# Clean and transform: drop missing values, add a log-scale column
clean_data <- raw_data %>%
  filter(!is.na(value)) %>%
  mutate(log_value = log(value))

# Summarize per category. Kept as a separate step because summarize()
# returns only the grouping and summary columns, so log_value would be
# discarded if it were computed in the same pipeline.
category_stats <- clean_data %>%
  group_by(category) %>%
  summarize(
    mean_val = mean(value),
    sd_val = sd(value)
  )

# Visualize. The plot is stored in a variable so ggsave() receives it
# explicitly rather than via last_plot(), and so Rscript does not
# auto-print it to a stray Rplots.pdf.
results_plot <- ggplot(category_stats, aes(x = category, y = mean_val)) +
  geom_col() +
  geom_errorbar(aes(ymin = mean_val - sd_val, ymax = mean_val + sd_val))

ggsave("results.png", results_plot, width = 10, height = 6, dpi = 300)

Statistical Modeling#

# Fit a linear regression of y on x1, x2 and x3 using columns of mydata
model <- lm(formula = y ~ x1 + x2 + x3, data = mydata)
summary(model)

# Write the fitted model object to disk
saveRDS(object = model, file = "model.rds")

# Later: read the model back in
loaded_model <- readRDS(file = "model.rds")