# osmextract benchmark (companion to benchmarks_scaling.ipynb)
osmextract is an R package, so its parsing runs here (with the R kernel) rather than inside the Python benchmarks_scaling.ipynb. It benchmarks buildings and the road network across the same area ladder the Python benchmark prepares (scaling_areas.csv), one area per cell, and writes the rows into osmextract_scaling_results.csv next to this file; benchmarks_scaling.ipynb loads that file and merges the osmextract rows into its results table and charts.
Run this with an R kernel (see the installation snippet in benchmarks_scaling.ipynb). It is kept fair the same way as the Python tools: each read runs in a separate Rscript process (bench_worker.R) wrapped with /usr/bin/time. The worker reports the read time and the feature count, and /usr/bin/time reports the process's peak memory (peak resident set size), which is comparable to the psutil peak the Python tools record. force_vectortranslate = TRUE re-does the PBF → features work on every run (the osmextract analogue of QuackOSM’s ignore_cache) rather than reading a cached GeoPackage. Each result is upserted into the CSV as it completes — re-running an area’s cell replaces just that row.
# Download step. The input matches the Python benchmark's get_data("helsinki"): the latest
# Helsinki extract from BBBike, which osmextract fetches itself through
# oe_get(provider = "bbbike"). With download_only and skip_vectortranslate set, the call only
# downloads the .osm.pbf and returns its path; that path feeds the version banner below and
# is the fallback source when the Kamppi clip has to be rebuilt.
suppressPackageStartupMessages(library(osmextract))

REPEATS <- 3L

# R's default download timeout is 60 s, which the ~62 MB extract can exceed. max() keeps a
# larger timeout if one is already configured.
options(timeout = max(600, getOption("timeout")))

pbf <- oe_get(
  "Helsinki",
  provider = "bbbike",
  download_only = TRUE,
  skip_vectortranslate = TRUE,
  quiet = FALSE
)

# Version banner: package versions on one line, the resolved PBF path on the next.
cat(
  "osmextract", as.character(packageVersion("osmextract")),
  "| sf", as.character(packageVersion("sf")),
  "\nPBF:", pbf, "\n"
)
The input place was matched with: Helsinki
The chosen file was already detected in the download directory. Skip downloading.
osmextract 0.6.0 | sf 1.1.0
PBF: /Users/tenkanh2/Library/Application Support/org.R-project.R/R/osmextract/bbbike_Helsinki.osm.pbf
## Buildings across area sizes
Each area below runs in its own cell, mirroring benchmarks_scaling.ipynb. Running an area cell benchmarks osmextract on that area’s PBF (time and peak memory) and appends its row to osmextract_scaling_results.csv; re-running a cell replaces just that area’s row. Run the setup cell first, then the area cells you want.
# Buildings setup, part 1: load the area manifest and locate the external pieces the per-area
# benchmark needs (the worker script, the `time` binary and the Rscript executable). The
# benchmark itself runs in a separate process, like the Python tools, so every area can be
# run -- and re-run -- from its own cell below.
areas <- read.csv("scaling_areas.csv", stringsAsFactors = FALSE)

# The worker script is resolved relative to the working directory.
worker <- normalizePath("bench_worker.R", mustWork = FALSE)
if (!file.exists(worker)) {
  stop("bench_worker.R must sit next to this notebook")
}

# macOS `time` reports resource usage with -l, GNU `time` with -v.
if (Sys.info()[["sysname"]] == "Darwin") {
  time_flag <- "-l"
} else {
  time_flag <- "-v"
}

# Prefer the standalone binary; fall back to whatever `time` resolves to on the PATH.
if (file.exists("/usr/bin/time")) {
  time_bin <- "/usr/bin/time"
} else {
  time_bin <- "time"
}

# Use the Rscript that belongs to the running R installation, then the PATH as a fallback.
rscript <- file.path(R.home("bin"), "Rscript")
if (!file.exists(rscript)) {
  rscript <- Sys.which("Rscript")
}
if (!(nzchar(rscript) && file.exists(rscript))) {
  stop("Could not find Rscript for this R installation")
}
# Locate a Python interpreter for the Kamppi fallback clip.
#
# Candidates are tried in this order: `python` on the PATH, `bin/python` two directory levels
# above R.home(), then the `python3` equivalents of both (many systems only ship `python3`).
#
# Returns the path of the first candidate that exists, as an unnamed length-one character
# vector, or NA_character_ when no candidate exists.
find_python <- function() {
  env_bin <- file.path(dirname(dirname(R.home())), "bin")
  candidates <- c(
    Sys.which("python"),
    file.path(env_bin, "python"),
    Sys.which("python3"),
    file.path(env_bin, "python3")
  )
  # Sys.which() yields "" for a missing command and names its result; drop both.
  candidates <- unname(candidates[nzchar(candidates) & file.exists(candidates)])
  if (length(candidates) == 0) {
    return(NA_character_)
  }
  candidates[1]
}
# Rebuild the Kamppi clip when its PBF from the manifest is missing.
#
# The clip is written to <tempdir()>/kamppi.osm.pbf. If that file already exists it is reused;
# otherwise a Python one-liner using pyrosm clips the Helsinki PBF (the global `pbf` set by the
# download cell) to a fixed bounding box.
#
# Returns the normalized path of the clip. Stops with an error when the Helsinki PBF is not
# available, when no Python interpreter is found, or when the clip was not produced (the last
# lines of the Python output are included in that message).
rebuild_kamppi_pbf <- function() {
  out <- file.path(tempdir(), "kamppi.osm.pbf")
  if (file.exists(out)) {
    return(normalizePath(out))
  }
  if (!exists("pbf") || !file.exists(pbf)) {
    stop("Kamppi PBF is missing and the Helsinki PBF from oe_get() is not available; re-run the first cell")
  }
  python <- find_python()
  if (is.na(python)) {
    stop("Kamppi PBF is missing and Python was not found; re-run the setup cell in benchmarks_scaling.ipynb")
  }
  script <- paste(
    "from pyrosm import OSM",
    "import sys",
    "OSM(sys.argv[1], bounding_box=[24.920, 60.162, 24.942, 60.172]).to_pbf(sys.argv[2])",
    sep = "\n"
  )
  # system2() quotes the command but pastes `args` into the shell command line verbatim, so
  # the multi-line script and any path containing spaces must be quoted explicitly.
  args <- shQuote(c("-c", script, pbf, out))
  log <- suppressWarnings(system2(python, args, stdout = TRUE, stderr = TRUE))
  if (!file.exists(out)) {
    detail <- paste(tail(log[nzchar(log)], 8), collapse = " | ")
    stop(sprintf("Could not rebuild Kamppi PBF: %s", detail))
  }
  # Normalize here too, so both return paths hand back the same form of the path.
  normalizePath(out)
}
# Resolve the PBF to benchmark for one area.
#
# area_label: area name from the manifest.
# pbf_path:   path recorded for that area in the manifest.
#
# Returns the normalized path when the file exists. A missing Kamppi file is rebuilt through
# rebuild_kamppi_pbf(); a missing file for any other area is an error.
resolve_area_pbf <- function(area_label, pbf_path) {
  if (!file.exists(pbf_path)) {
    if (!identical(area_label, "Kamppi")) {
      stop(sprintf("PBF for '%s' not found: %s. Re-run the setup cell in benchmarks_scaling.ipynb.",
                   area_label, pbf_path))
    }
    return(rebuild_kamppi_pbf())
  }
  normalizePath(pbf_path)
}
# Extract the peak resident set size, in MB, from the captured output of `/usr/bin/time`.
#
# output_lines: character vector of output lines (worker output and `time` report mixed).
#
# Two report formats are recognised, tried in this order:
#   macOS `/usr/bin/time -l`: "<bytes> maximum resident set size"            (bytes / 1e6)
#   GNU   `/usr/bin/time -v`: "Maximum resident set size (kbytes): <kb>"     (kB / 1e3)
# Returns NA_real_ when neither format is present.
parse_peak_rss_mb <- function(output_lines) {
  joined <- paste(output_lines, collapse = "\n")

  # Number captured by the first match of `pattern`, or NULL when there is no match.
  first_capture <- function(pattern) {
    hit <- regmatches(joined, regexec(pattern, joined))[[1]]
    if (length(hit) < 2) {
      return(NULL)
    }
    as.numeric(hit[2])
  }

  bytes <- first_capture("([0-9]+) +maximum resident set size")
  if (!is.null(bytes)) {
    return(bytes / 1e6)
  }

  kbytes <- first_capture("Maximum resident set size \\(kbytes\\): ([0-9]+)")
  if (!is.null(kbytes)) {
    return(kbytes / 1e3)
  }

  NA_real_
}
# Benchmark one osmextract read in a separate Rscript process, `repeats` times.
#
# worker_task: task name passed to the worker script on its command line.
# csv_task:    task name written to the result row.
# area:        area label written to the result row.
# pbf:         path of the PBF to read.
# repeats:     number of runs.
#
# Each run executes `<time_bin> <time_flag> <rscript> <worker> <worker_task> <pbf>` (these are
# the globals defined in the setup cell) and expects the worker to print a
# "seconds,features" line. Peak memory is parsed from the `time` report.
#
# Returns a one-row data.frame (task, area, tool, seconds, peak_mb, features, status). seconds
# and peak_mb are medians over the successful runs. When no run succeeded they are NA and
# status carries the error text.
# NOTE(review): if a later run fails after an earlier one succeeded, the row is still reported
# as "ok" from the runs that completed -- confirm this best-effort behaviour is intended.
bench_oe <- function(worker_task, csv_task, area, pbf, repeats) {
  runtimes <- numeric(0)
  peaks <- numeric(0)
  n <- NA_integer_
  status <- "ok"

  # system2() quotes the command but not `args`; quote them so paths containing spaces (for
  # example under "Application Support" on macOS) reach the worker as single arguments.
  cmd_args <- shQuote(c(time_flag, rscript, worker, worker_task, pbf))

  for (i in seq_len(repeats)) {
    out <- tryCatch(
      suppressWarnings(system2(time_bin, cmd_args, stdout = TRUE, stderr = TRUE)),
      error = function(e) {
        status <<- paste("error:", conditionMessage(e))
        character(0)
      }
    )
    if (length(out) == 0) {
      # No output without an R-level error would otherwise be reported as an "ok" row of NAs.
      if (identical(status, "ok")) status <- "error: no worker output"
      break
    }
    data_line <- grep("^[0-9.]+,[0-9]+$", out, value = TRUE) # "seconds,features" from the worker
    if (length(data_line) == 0) {
      detail <- paste(tail(out[nzchar(out)], 8), collapse = " | ")
      status <- if (nzchar(detail)) paste("error:", detail) else "error: no worker output"
      break
    }
    # Use the last matching line in case earlier output happens to match the pattern.
    parts <- as.numeric(strsplit(data_line[length(data_line)], ",")[[1]])
    runtimes <- c(runtimes, parts[1])
    n <- as.integer(parts[2])
    peaks <- c(peaks, parse_peak_rss_mb(out))
  }

  if (length(runtimes) == 0) {
    return(data.frame(task = csv_task, area = area, tool = "osmextract",
                      seconds = NA_real_, peak_mb = NA_real_, features = NA_integer_,
                      status = status, stringsAsFactors = FALSE))
  }
  # na.rm: one run whose `time` report could not be parsed must not blank the whole median.
  data.frame(task = csv_task, area = area, tool = "osmextract",
             seconds = round(median(runtimes), 2),
             peak_mb = round(median(peaks, na.rm = TRUE), 1),
             features = n, status = "ok", stringsAsFactors = FALSE)
}
# Accumulated results across the area cells (one row per task and area). Re-running this setup
# cell keeps whatever has already been collected in this session. In a fresh session the
# accumulator is seeded from a previously written osmextract_scaling_results.csv, so that
# running a single area cell replaces just that row instead of overwriting the rows exported
# by earlier sessions. An unreadable (e.g. empty) CSV falls back to an empty data frame.
if (!exists("oe_scaling_results")) {
  oe_scaling_results <- if (file.exists("osmextract_scaling_results.csv")) {
    tryCatch(
      read.csv("osmextract_scaling_results.csv", stringsAsFactors = FALSE),
      error = function(e) data.frame()
    )
  } else {
    data.frame()
  }
}
# Benchmark one (task, area) pair, upsert its row into the global oe_scaling_results and
# rewrite osmextract_scaling_results.csv immediately -- so each area is exported as it
# finishes, and re-running an area replaces just that row.
#
# worker_task: task name passed to the worker script.
# csv_task:    task name stored in the results.
# area_label:  area name; must match a row of the manifest loaded into `areas`.
#
# Prints the new row and returns it invisibly (the value of print()). Stops when the area is
# not in the manifest.
run_oe_for <- function(worker_task, csv_task, area_label) {
  row <- areas[areas$area %in% area_label, ]
  if (nrow(row) == 0) stop(sprintf("area '%s' not in scaling_areas.csv", area_label))
  pbf_path <- resolve_area_pbf(row$area[1], row$path[1])

  # A manifest without a usable `repeats` value would otherwise fail inside seq_len(); fall
  # back to the notebook-level REPEATS constant (or 3 when that cell has not been run).
  repeats <- row$repeats[1]
  if (is.null(repeats) || is.na(repeats)) {
    repeats <- if (exists("REPEATS")) REPEATS else 3L
  }

  result <- bench_oe(worker_task, csv_task, row$area[1], pbf_path, repeats)

  # %in% never yields NA, and gives logical(0) while the accumulator is still empty.
  same_row <- oe_scaling_results$task %in% csv_task & oe_scaling_results$area %in% area_label
  oe_scaling_results <<- rbind(oe_scaling_results[!same_row, ], result)
  write.csv(oe_scaling_results, "osmextract_scaling_results.csv", row.names = FALSE)
  print(result)
}
# Per-task entry points used by the area cells below. Both delegate to run_oe_for(); the road
# network uses the worker task "network" but is stored under the task name "roads".
run_oe_buildings <- function(area_label) {
  run_oe_for(worker_task = "buildings", csv_task = "buildings", area_label = area_label)
}
run_oe_network <- function(area_label) {
  run_oe_for(worker_task = "network", csv_task = "roads", area_label = area_label)
}
# Kamppi (smallest area) -- buildings; upserts the (buildings, Kamppi) row
run_oe_buildings("Kamppi")
task area tool seconds peak_mb features status
1 buildings Kamppi osmextract 0.06 349.6 663 ok
# Helsinki region (small city) -- buildings
run_oe_buildings("Helsinki region")
task area tool seconds peak_mb features status
1 buildings Helsinki region osmextract 3.99 1095.8 176875 ok
# New York City (large city) -- buildings
run_oe_buildings("New York City")
task area tool seconds peak_mb features status
1 buildings New York City osmextract 26.08 5018.3 1600076 ok
# Paris (larger city) -- buildings
run_oe_buildings("Paris")
task area tool seconds peak_mb features status
1 buildings Paris osmextract 27.2 5575.1 1888259 ok
# Finland (country) -- buildings
run_oe_buildings("Finland")
task area tool seconds peak_mb features status
1 buildings Finland osmextract 76.19 7793.7 3146667 ok
# Spain (largest country) -- buildings
run_oe_buildings("Spain")
task area tool seconds peak_mb features status
1 buildings Spain osmextract 154.43 11905.5 5557273 ok
## Road network across area sizes
The same per-area benchmark for the road network (osmextract’s lines layer, highway IS NOT NULL). Each cell records osmextract’s time and peak memory for one area and upserts its row (task = roads) into osmextract_scaling_results.csv. Run the setup cell first.
# Kamppi (smallest area) -- road network; upserts the (roads, Kamppi) row
run_oe_network("Kamppi")
task area tool seconds peak_mb features status
1 roads Kamppi osmextract 0.07 350.9 4716 ok
# Helsinki region (small city) -- road network
run_oe_network("Helsinki region")
task area tool seconds peak_mb features status
1 roads Helsinki region osmextract 3.14 1022.3 296845 ok
# New York City (large city) -- road network
run_oe_network("New York City")
task area tool seconds peak_mb features status
1 roads New York City osmextract 6.65 1584.9 669037 ok
# Paris (larger city) -- road network
run_oe_network("Paris")
task area tool seconds peak_mb features status
1 roads Paris osmextract 7.34 1503.3 620227 ok
# Finland (country) -- road network
run_oe_network("Finland")
task area tool seconds peak_mb features status
1 roads Finland osmextract 32.57 4657.9 2178539 ok
# Spain (largest country) -- road network
run_oe_network("Spain")
task area tool seconds peak_mb features status
1 roads Spain osmextract 114.19 10010.6 5716619 ok