When processing large datasets, parallel generation can significantly reduce execution time. This tutorial covers efficient batch processing strategies with localLLM.
Sequential processing with a for-loop processes one prompt at a time. Parallel processing batches multiple prompts together, sharing computation and reducing overhead.
In benchmarks, `generate_parallel()` typically completes in roughly 65% of the time required by equivalent sequential `generate()` calls.
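The difference in calling patterns is small. As an illustrative sketch (assuming `ctx` and `prompts` are already set up as in the examples below):

``` r
# Sequential: one generate() call per prompt
answers_seq <- vapply(prompts, function(p) {
  generate(ctx, p, max_tokens = 50)
}, character(1))

# Parallel: a single call processes the whole vector of prompts
answers_par <- generate_parallel(ctx, prompts, max_tokens = 50)
```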
``` r
library(localLLM)

# Load model
model <- model_load("Llama-3.2-3B-Instruct-Q5_K_M.gguf", n_gpu_layers = 999)

# Create context with batch support
ctx <- context_create(
  model,
  n_ctx = 2048,
  n_seq_max = 10  # Allow up to 10 parallel sequences
)

# Define prompts
prompts <- c(
  "What is the capital of France?",
  "What is the capital of Germany?",
  "What is the capital of Italy?"
)

# Format prompts
formatted_prompts <- sapply(prompts, function(p) {
  messages <- list(
    list(role = "system", content = "Answer concisely."),
    list(role = "user", content = p)
  )
  apply_chat_template(model, messages)
})

# Process in parallel
results <- generate_parallel(ctx, formatted_prompts, max_tokens = 50)
print(results)
#> [1] "The capital of France is Paris."
#> [2] "The capital of Germany is Berlin."
#> [3] "The capital of Italy is Rome."
```
Here’s a complete example classifying news articles:
``` r
library(localLLM)

# Load sample dataset
data("ag_news_sample", package = "localLLM")

# Load model
model <- model_load("Llama-3.2-3B-Instruct-Q5_K_M.gguf", n_gpu_layers = 999)

# Create context (n_seq_max determines max parallel prompts)
ctx <- context_create(model, n_ctx = 1048, n_seq_max = 10)

# Prepare all prompts
all_prompts <- character(nrow(ag_news_sample))
for (i in seq_len(nrow(ag_news_sample))) {
  messages <- list(
    list(role = "system", content = "You are a helpful assistant."),
    list(role = "user", content = paste0(
      "Classify this news article into exactly one category: ",
      "World, Sports, Business, or Sci/Tech. ",
      "Respond with only the category name.\n\n",
      "Title: ", ag_news_sample$title[i], "\n",
      "Description: ", substr(ag_news_sample$description[i], 1, 100), "\n\n",
      "Category:"
    ))
  )
  all_prompts[i] <- apply_chat_template(model, messages)
}

# Process all samples in parallel
results <- generate_parallel(
  context = ctx,
  prompts = all_prompts,
  max_tokens = 5,
  seed = 92092,
  progress = TRUE,
  clean = TRUE
)

# Extract predictions
ag_news_sample$LLM_result <- sapply(results, function(x) {
  trimws(gsub("\\n.*$", "", x))
})

# Calculate accuracy
accuracy <- mean(ag_news_sample$LLM_result == ag_news_sample$class)
cat("Accuracy:", round(accuracy * 100, 1), "%\n")
#> Accuracy: 87.0 %
```
For comparison, the same task processed sequentially:

``` r
# Sequential approach
ag_news_sample$LLM_result <- NA
ctx <- context_create(model, n_ctx = 512)

system.time({
  for (i in seq_len(nrow(ag_news_sample))) {
    formatted_prompt <- all_prompts[i]
    output <- generate(ctx, formatted_prompt, max_tokens = 5, seed = 92092)
    ag_news_sample$LLM_result[i] <- trimws(output)
  }
})
#>    user  system elapsed
#>   45.32    2.15   48.23
```
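To reproduce the comparison behind the ~65% figure, you can time the parallel run over the same prompts in the same way (a sketch reusing the objects defined above; absolute timings depend on your hardware):

``` r
# Recreate a parallel-capable context and time the batched run
ctx_par <- context_create(model, n_ctx = 2048, n_seq_max = 10)
system.time({
  results_par <- generate_parallel(ctx_par, all_prompts,
                                   max_tokens = 5, seed = 92092)
})
```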
The simplest approach for parallel processing is passing a vector to `quick_llama()`:
``` r
# quick_llama automatically uses parallel mode for vectors
prompts <- c(
  "Summarize: Climate change is affecting global weather patterns...",
  "Summarize: The stock market reached new highs today...",
  "Summarize: Scientists discovered a new species of deep-sea fish..."
)

results <- quick_llama(prompts, max_tokens = 50)
print(results)
```

The context window is shared across parallel sequences, so `n_ctx` must be large enough for all concurrent prompts and their completions:
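As a rough sizing sketch (the even per-sequence split below is an approximation, not a documented localLLM guarantee):

``` r
# The n_ctx budget is shared by all n_seq_max sequences, so each prompt
# plus its completion should fit within roughly n_ctx / n_seq_max tokens.
n_seq_max <- 10
n_ctx <- 2048
n_ctx / n_seq_max  # average tokens available per sequence
#> [1] 204.8
```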
Parallel processing uses more memory. Monitor with:
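A minimal sketch using general R and system tools (not localLLM-specific APIs):

``` r
# R-side allocations
gc()

# For GPU-offloaded models, watch VRAM from the shell, e.g. on NVIDIA GPUs:
# system("nvidia-smi --query-gpu=memory.used --format=csv")
```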
| Dataset size (prompts) | Recommended `n_seq_max` |
|---|---|
| < 100 | 4-8 |
| 100-1000 | 8-16 |
| > 1000 | 16-32 (memory permitting) |
If a prompt fails, the result will contain an error message:
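A defensive check might look like this (the exact wording of the error message is not specified here, so the pattern below is an assumption):

``` r
# Flag results that look like failures (the "error" pattern is an assumption)
failed <- grepl("error", results, ignore.case = TRUE)
if (any(failed)) {
  warning(sum(failed), " prompt(s) failed; inspect results[failed]")
}
```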
Putting it all together, a complete workflow from setup to evaluation:

``` r
library(localLLM)

# 1. Setup
model <- model_load("Llama-3.2-3B-Instruct-Q5_K_M.gguf", n_gpu_layers = 999)
ctx <- context_create(model, n_ctx = 2048, n_seq_max = 10)

# 2. Prepare prompts
data("ag_news_sample", package = "localLLM")
prompts <- sapply(seq_len(nrow(ag_news_sample)), function(i) {
  messages <- list(
    list(role = "system", content = "Classify news articles."),
    list(role = "user", content = paste0(
      "Category (World/Sports/Business/Sci/Tech): ",
      ag_news_sample$title[i]
    ))
  )
  apply_chat_template(model, messages)
})

# 3. Process in batches with progress
results <- generate_parallel(
  ctx, prompts,
  max_tokens = 10,
  seed = 42,
  progress = TRUE,
  clean = TRUE
)

# 4. Extract and evaluate
predictions <- sapply(results, function(x) trimws(gsub("\\n.*", "", x)))
accuracy <- mean(predictions == ag_news_sample$class)
cat("Accuracy:", round(accuracy * 100, 1), "%\n")| Function | Use Case |
|---|---|
generate() |
Single prompts, interactive use |
generate_parallel() |
Batch processing, large datasets |
quick_llama(vector) |
Quick batch processing |
explore() |
Multi-model comparison with batching |
- Set `n_seq_max` when creating the context for parallel use
- Scale `n_ctx` with `n_seq_max` to give each sequence enough space
- Use `progress = TRUE` for large batches
- Use `clean = TRUE` to automatically strip control tokens
- Set a `seed` for reproducibility across batches