osmextract benchmark (companion to benchmarks_scaling.ipynb)

osmextract benchmark (companion to benchmarks_scaling.ipynb)

osmextract is an R package, so its parsing runs here (with the R kernel) rather than inside the Python benchmarks_scaling.ipynb. It benchmarks buildings and the road network across the same area ladder the Python benchmark prepares (scaling_areas.csv), one area per cell, and writes the rows into osmextract_scaling_results.csv next to this file; benchmarks_scaling.ipynb loads that file and merges the osmextract rows into its results table and charts.

Run this with an R kernel (see the installation snippet in benchmarks_scaling.ipynb). It is kept fair the same way as the Python tools: each read runs in a separate Rscript process (bench_worker.R) wrapped with /usr/bin/time, so its time and peak memory (peak resident set size) are captured the same way as the psutil peak the Python tools record, and force_vectortranslate = TRUE re-does the PBF → features work each run (the osmextract analogue of QuackOSM’s ignore_cache) rather than reading a cached GeoPackage. Each result is upserted into the CSV as it completes — re-running an area’s cell replaces just that row.

# Input data: the latest Helsinki extract from BBBike, i.e. the same input the Python benchmark
# uses (get_data("helsinki")). osmextract downloads it itself through
# oe_get(provider = "bbbike"). With download_only + skip_vectortranslate the call only fetches
# the .osm.pbf and returns its path, which is printed in the version banner below and serves as
# the fallback source when the Kamppi clip has to be rebuilt.
suppressPackageStartupMessages(library(osmextract))

REPEATS <- 3L

# R's default download timeout is 60 s, which the ~62 MB extract can exceed; allow at least 600 s.
options(timeout = max(600, getOption("timeout")))

pbf <- oe_get(
    "Helsinki",
    provider = "bbbike",
    download_only = TRUE,
    skip_vectortranslate = TRUE,
    quiet = FALSE
)

# Version banner: package versions on the first line, the PBF path on the second.
cat(
    "osmextract", as.character(packageVersion("osmextract")),
    "| sf", as.character(packageVersion("sf")),
    "\nPBF:", pbf, "\n"
)
The input place was matched with: Helsinki

The chosen file was already detected in the download directory. Skip downloading.
osmextract 0.6.0 | sf 1.1.0 
PBF: /Users/tenkanh2/Library/Application Support/org.R-project.R/R/osmextract/bbbike_Helsinki.osm.pbf 

Buildings across area sizes

Each area below runs in its own cell, mirroring benchmarks_scaling.ipynb. Running an area cell benchmarks osmextract on that area’s PBF (time and peak memory) and appends its row to osmextract_scaling_results.csv; re-running a cell replaces just that area’s row. Run the setup cell first, then the area cells you want.

# Benchmark setup. Loads the area manifest written by the Python benchmark and locates the
# pieces every measurement needs: the worker script, the `time` wrapper with the flag that makes
# it report peak memory, and the Rscript binary of this R installation. Each measurement then
# runs in its own process, like the Python tools, so areas can be run -- and re-run -- one cell
# at a time.
areas <- read.csv("scaling_areas.csv", stringsAsFactors = FALSE)

worker <- normalizePath("bench_worker.R", mustWork = FALSE)
if (!file.exists(worker)) {
    stop("bench_worker.R must sit next to this notebook")
}

# `-l` on macOS (Darwin), `-v` everywhere else.
time_flag <- switch(Sys.info()[["sysname"]], Darwin = "-l", "-v")

# Prefer the standalone /usr/bin/time; otherwise fall back to whatever `time` resolves to.
time_bin <- "/usr/bin/time"
if (!file.exists(time_bin)) {
    time_bin <- "time"
}

# Rscript from the running R installation first, then whatever is on the PATH.
rscript <- file.path(R.home("bin"), "Rscript")
if (!file.exists(rscript)) {
    rscript <- Sys.which("Rscript")
}
if (!(nzchar(rscript) && file.exists(rscript))) {
    stop("Could not find Rscript for this R installation")
}


# Locate a Python interpreter for the Kamppi fallback clip.
#
# Candidates, in priority order: `python` on the PATH, `python` in the `bin` directory two levels
# above R.home() (presumably the layout of a shared conda-style environment -- confirm), then the
# same two locations for `python3`, because many systems ship no unversioned `python` executable.
#
# Returns the first existing candidate as an unnamed length-1 character vector, or NA_character_
# when none exists.
find_python <- function() {
    env_bin <- file.path(dirname(dirname(R.home())), "bin")
    candidates <- c(
        Sys.which("python"), file.path(env_bin, "python"),
        Sys.which("python3"), file.path(env_bin, "python3")
    )
    # Sys.which() yields "" for a miss and a named vector for a hit; keep real files, drop names.
    candidates <- unname(candidates[nzchar(candidates) & file.exists(candidates)])
    if (length(candidates) == 0) return(NA_character_)
    candidates[1]
}


# Recreate the Kamppi clip when the one listed in the manifest is missing.
#
# The clip is written to <tempdir()>/kamppi.osm.pbf and reused if it is already there. Otherwise
# the Helsinki PBF held in the global `pbf` (set by the first cell) is clipped to the Kamppi
# bounding box by a short pyrosm script run with the interpreter from find_python().
#
# Returns the normalised path of the clip. Stops with an explanatory message when the Helsinki
# PBF or Python is unavailable, or when the clip could not be produced.
rebuild_kamppi_pbf <- function() {
    out <- file.path(tempdir(), "kamppi.osm.pbf")
    if (file.exists(out)) return(normalizePath(out))
    if (!exists("pbf") || !file.exists(pbf)) {
        stop("Kamppi PBF is missing and the Helsinki PBF from oe_get() is not available; re-run the first cell")
    }
    python <- find_python()
    if (is.na(python)) {
        stop("Kamppi PBF is missing and Python was not found; re-run the setup cell in benchmarks_scaling.ipynb")
    }
    script <- paste(
        "from pyrosm import OSM",
        "import sys",
        "OSM(sys.argv[1], bounding_box=[24.920, 60.162, 24.942, 60.172]).to_pbf(sys.argv[2])",
        sep = "\n"
    )
    # system2() pastes `args` into a shell command line without quoting them, so the multi-line
    # script and any path containing spaces must be shell-quoted to arrive as single arguments.
    py_args <- shQuote(c("-c", script, pbf, out))
    log <- suppressWarnings(system2(python, py_args, stdout = TRUE, stderr = TRUE))
    # With stdout = TRUE a non-zero exit code is reported through the "status" attribute.
    exit_status <- attr(log, "status")
    failed <- !file.exists(out) || (!is.null(exit_status) && exit_status != 0)
    if (failed) {
        # Do not leave a partial file behind: the early return above would reuse it next time.
        unlink(out)
        detail <- paste(tail(log[nzchar(log)], 8), collapse = " | ")
        stop(sprintf("Could not rebuild Kamppi PBF: %s", detail))
    }
    normalizePath(out)
}


# Turn a manifest entry into a usable PBF path.
#
# An existing `pbf_path` is returned normalised. A missing one is only recoverable for the
# "Kamppi" area, whose clip can be rebuilt locally; any other missing file is an error that
# points back to the Python notebook's setup cell.
resolve_area_pbf <- function(area_label, pbf_path) {
    if (!file.exists(pbf_path)) {
        if (!identical(area_label, "Kamppi")) {
            stop(sprintf("PBF for '%s' not found: %s. Re-run the setup cell in benchmarks_scaling.ipynb.",
                         area_label, pbf_path))
        }
        return(rebuild_kamppi_pbf())
    }
    normalizePath(pbf_path)
}


# Extract the peak resident set size, in MB, from the combined output of `/usr/bin/time`.
#
# Two report formats are recognised:
#   macOS `time -l`: "<bytes>  maximum resident set size"          (bytes -> MB, / 1e6)
#   GNU `time -v`:   "Maximum resident set size (kbytes): <kb>"    (kB -> MB, / 1e3)
# Returns NA_real_ when neither line is present.
parse_peak_rss_mb <- function(output_lines) {
    joined <- paste(output_lines, collapse = "\n")
    # First occurrence of `pattern` in the joined output; character(0) when there is none.
    first_match <- function(pattern) regmatches(joined, regexpr(pattern, joined))

    bsd_line <- first_match("[0-9]+ +maximum resident set size")
    if (length(bsd_line) == 1) {
        return(as.numeric(sub(" .*", "", bsd_line)) / 1e6)
    }

    gnu_line <- first_match("Maximum resident set size \\(kbytes\\): [0-9]+")
    if (length(gnu_line) == 1) {
        return(as.numeric(sub(".*: ", "", gnu_line)) / 1e3)
    }

    NA_real_
}


# Benchmark one (task, area) pair and summarise it as a one-row data frame.
#
# The worker script is run `repeats` times, each time in a fresh Rscript process wrapped by
# `time_bin time_flag`, so both the worker's own "seconds,features" line and the peak-memory
# report of `time` end up in the captured output.
#
# Args:
#   worker_task: task name handed to bench_worker.R.
#   csv_task:    task label stored in the result row.
#   area:        area label stored in the result row.
#   pbf:         path of the PBF to read.
#   repeats:     number of runs to attempt.
#
# Returns a data frame with columns task, area, tool, seconds, peak_mb, features, status.
# `seconds` and `peak_mb` are medians over the successful runs. When no run succeeded they are
# NA and `status` carries the error detail. The loop stops at the first failing run; if earlier
# runs succeeded, their measurements are still reported with status "ok".
#
# Reads the globals time_bin, time_flag, rscript and worker defined by the setup cell.
bench_oe <- function(worker_task, csv_task, area, pbf, repeats) {
    result_row <- function(seconds, peak_mb, features, status) {
        data.frame(task = csv_task, area = area, tool = "osmextract",
                   seconds = seconds, peak_mb = peak_mb, features = features,
                   status = status, stringsAsFactors = FALSE)
    }
    # system2() pastes `args` into a shell command line without quoting them, so a worker or PBF
    # path containing spaces would otherwise be split into several arguments.
    cmd_args <- shQuote(c(time_flag, rscript, worker, worker_task, pbf))
    runtimes <- numeric(0); peaks <- numeric(0); n <- NA_integer_; status <- "ok"
    for (i in seq_len(repeats)) {
        out <- tryCatch(suppressWarnings(system2(time_bin, cmd_args, stdout = TRUE, stderr = TRUE)),
                        error = function(e) e)
        if (inherits(out, "error")) {
            status <- paste("error:", conditionMessage(out))
            break
        }
        data_line <- grep("^[0-9.]+,[0-9]+$", out, value = TRUE)  # "seconds,features" from the worker
        if (length(data_line) == 0) {
            # Also covers a run that printed nothing at all, which must not be reported as "ok".
            detail <- paste(tail(out[nzchar(out)], 8), collapse = " | ")
            status <- if (nzchar(detail)) paste("error:", detail) else "error: no worker output"
            break
        }
        parts <- as.numeric(strsplit(data_line[length(data_line)], ",")[[1]])
        runtimes <- c(runtimes, parts[1]); n <- as.integer(parts[2])
        peaks <- c(peaks, parse_peak_rss_mb(out))
    }
    if (length(runtimes) == 0) {
        return(result_row(NA_real_, NA_real_, NA_integer_, status))
    }
    # na.rm: one run whose `time` report could not be parsed should not blank the whole median.
    result_row(round(median(runtimes), 2), round(median(peaks, na.rm = TRUE), 1), n, "ok")
}


# Accumulated results across the area cells (one row per task/area pair). Re-running this setup
# cell keeps whatever has already been collected. In a fresh session the rows already exported
# to osmextract_scaling_results.csv are loaded back, so that the first upsert replaces only its
# own row instead of rewriting the file with that single row. A file that cannot be read, or
# whose columns differ from the result rows built by bench_oe(), is ignored.
if (!exists("oe_scaling_results")) {
    oe_scaling_results <- local({
        csv_path <- "osmextract_scaling_results.csv"
        expected <- c("task", "area", "tool", "seconds", "peak_mb", "features", "status")
        previous <- if (file.exists(csv_path)) {
            tryCatch(read.csv(csv_path, stringsAsFactors = FALSE), error = function(e) NULL)
        } else {
            NULL
        }
        if (!is.null(previous) && identical(names(previous), expected)) previous else data.frame()
    })
}


# Benchmark one area for one task and upsert the outcome.
#
# Looks `area_label` up in the manifest (`areas`), resolves its PBF, runs bench_oe() with the
# manifest's repeat count, then replaces any existing (csv_task, area_label) row in the global
# `oe_scaling_results` with the new one. The CSV is rewritten straight away, so every result is
# exported as soon as it finishes. The new row is printed.
run_oe_for <- function(worker_task, csv_task, area_label) {
    manifest_rows <- areas[areas$area == area_label, ]
    if (nrow(manifest_rows) == 0) {
        stop(sprintf("area '%s' not in scaling_areas.csv", area_label))
    }

    # Only the first manifest row for the area is used.
    area_name <- manifest_rows$area[1]
    pbf_path <- resolve_area_pbf(area_name, manifest_rows$path[1])
    result <- bench_oe(worker_task, csv_task, area_name, pbf_path, manifest_rows$repeats[1])

    # Drop the row this run supersedes, then append the fresh one.
    superseded <- oe_scaling_results$task == csv_task & oe_scaling_results$area == area_label
    oe_scaling_results <<- rbind(oe_scaling_results[!superseded, ], result)

    write.csv(oe_scaling_results, "osmextract_scaling_results.csv", row.names = FALSE)
    print(result)
}


# Entry points called by the area cells. Buildings use the worker task "buildings" and are stored
# under the same label; the road network uses the worker task "network" and is stored as "roads".
run_oe_buildings <- function(area_label) {
    run_oe_for(worker_task = "buildings", csv_task = "buildings", area_label = area_label)
}
run_oe_network <- function(area_label) {
    run_oe_for(worker_task = "network", csv_task = "roads", area_label = area_label)
}
# Kamppi (smallest area)
run_oe_buildings("Kamppi")
       task   area       tool seconds peak_mb features status
1 buildings Kamppi osmextract    0.06   349.6      663     ok
# Helsinki region (small city)
run_oe_buildings("Helsinki region")
       task            area       tool seconds peak_mb features status
1 buildings Helsinki region osmextract    3.99  1095.8   176875     ok
# New York City (large city)
run_oe_buildings("New York City")
       task          area       tool seconds peak_mb features status
1 buildings New York City osmextract   26.08  5018.3  1600076     ok
# Paris (larger city)
run_oe_buildings("Paris")
       task  area       tool seconds peak_mb features status
1 buildings Paris osmextract    27.2  5575.1  1888259     ok
# Finland (country)
run_oe_buildings("Finland")
       task    area       tool seconds peak_mb features status
1 buildings Finland osmextract   76.19  7793.7  3146667     ok
# Spain (largest country)
run_oe_buildings("Spain")
       task  area       tool seconds peak_mb features status
1 buildings Spain osmextract  154.43 11905.5  5557273     ok

Road network across area sizes

The same per-area benchmark for the road network (osmextract’s lines layer, highway IS NOT NULL). Each cell records osmextract’s time and peak memory for one area and upserts its row (task = roads) into osmextract_scaling_results.csv. Run the setup cell first.

# Kamppi (smallest area)
run_oe_network("Kamppi")
   task   area       tool seconds peak_mb features status
1 roads Kamppi osmextract    0.07   350.9     4716     ok
# Helsinki region (small city)
run_oe_network("Helsinki region")
   task            area       tool seconds peak_mb features status
1 roads Helsinki region osmextract    3.14  1022.3   296845     ok
# New York City (large city)
run_oe_network("New York City")
   task          area       tool seconds peak_mb features status
1 roads New York City osmextract    6.65  1584.9   669037     ok
# Paris (larger city)
run_oe_network("Paris")
   task  area       tool seconds peak_mb features status
1 roads Paris osmextract    7.34  1503.3   620227     ok
# Finland (country)
run_oe_network("Finland")
   task    area       tool seconds peak_mb features status
1 roads Finland osmextract   32.57  4657.9  2178539     ok
# Spain (largest country)
run_oe_network("Spain")
   task  area       tool seconds peak_mb features status
1 roads Spain osmextract  114.19 10010.6  5716619     ok