Skip to main content
This guide will walk you through retrieving data from the GSS StatsBank API using the 2021 Population and Housing Census (PHC) as an example.

Step 1: Connect to the API

Start by defining the base URL and initializing a request:
library(httr2)
library(tidyverse)

URL = "https://statsbank.statsghana.gov.gh:443/api/v1/en/"
request = request(URL)

Step 2: Create a Helper Function

To navigate the API’s hierarchical structure efficiently, create a helper function that builds URLs and shows what’s available at each level:
build_url = function(URL, ...) {
    path = list(...)
    req = request(URL)
    
    # Add each folder name to the URL path incrementally
    full_req = purrr::reduce(path, req_url_path_append, .init = req)
    
    # Check if the last part of the path ends in ".px" (indicating a table)
    is_table = FALSE

    if (length(path) > 0) {
      if (grepl("\\.px$", path[[length(path)]], ignore.case = TRUE)) {
        is_table = TRUE
      }
    }

    # Fetch the content to see what is inside (GET request)
    response = req_perform(full_req)
    body = resp_body_json(response)

    if (is_table) {
        # If it is a table, print the variable names (metadata)
        message("Endpoint reached: Table found.")
        message("Available variables:")
        print(map_chr(body$variables, "code"))
    } else {
        # If it is a folder/database, list the children IDs
        key = if(length(path) == 0) "dbid" else "id"
        print(map_chr(body, key))
    }

    # Return the request object to be assigned to a variable
    return(full_req) 
}
This function automatically detects whether you’ve reached a table (ends in .px) or a folder, displaying the appropriate information to help you navigate.

Step 3: Navigate to a Table

Use the helper function to explore the database structure:
1

List available databases

build_url(URL)
This returns databases like “PHC 2021 StatsBank”, “GLSS7”, “Trade”, etc.
2

Explore a database

build_url(URL, "PHC 2021 StatsBank")
This shows topics like “Water and Sanitation”, “Education and Literacy”, “Population”, etc.
3

Find tables in a topic

build_url(URL, "PHC 2021 StatsBank", "Water and Sanitation")
This lists tables like waterDisposal_table.px, mainwater_table.px, etc.
4

Connect to a specific table

table_req = build_url(URL, "PHC 2021 StatsBank", "Water and Sanitation", "waterDisposal_table.px")
This displays available variables: WaterDisposal, Locality, Geographic_Area
Save the final request as table_req so you can reuse it without retyping the full path.

Step 4: Construct a Query

Define what data you want to retrieve using a JSON query structure:
query_list = list(
  query = list(
    list(
      code = "WaterDisposal",
      selection = list(
        filter = "item",
        values = list(
          "Total households in occupied dwelling unit",
          "Through the sewerage system"
        )
      )
    ),
    list(
       code = "Geographic_Area",
       selection = list(filter = "all", values = list("*"))
    )
  ),
  response = list(
    format = "csv"
  )
)
Use filter = "item" to select specific values, or filter = "all" with values = list("*") to select everything.

Finding Available Values

Not sure what values are available for a variable? Inspect the table metadata:
# Fetch the metadata for the table
metadata = table_req |> 
  req_perform() |> 
  resp_body_json()

# Find the variable you're interested in
variable_info = metadata$variables |> 
  purrr::keep(~ .x$code == "WaterDisposal") |> 
  purrr::flatten()

# Print the available options 
tibble(
  Code = unlist(variable_info$values)
)
This returns options like:
  • “Total households in occupied dwelling unit”
  • “Through the sewerage system”
  • “Thrown onto the ground/street/outside”
  • “Through drainage into a pit (soak away)“

Step 5: Retrieve and Parse the Data

Execute your query and transform the response into a clean dataframe:
# Attach the query and perform the request
response = table_req |> 
  req_body_json(query_list) |> 
  req_perform()

# Parse and clean the data
data = response |> 
  resp_body_raw() |> 
  readr::read_csv() |> 
  tidyr::pivot_longer(
    cols = -1, 
    names_to = "Geographic_Area", 
    values_to = "count" 
  ) |>
  tidyr::pivot_wider(
    names_from = 1, 
    values_from = count
  ) |> 
  rename(
    total_households = `Total households in occupied dwelling unit`,
    sewerage_disposal = `Through the sewerage system`
  )

head(data, n=10)
# A tibble: 10 × 3
   Geographic_Area                           total_households sewerage_disposal
   <chr>                                                <dbl>             <dbl>
 1 Ghana                                              8356779            192638
 2 Western                                             620785              9238
 3 Jomoro Municipal                                     39438               418
 4 Ellembelle                                           37206               651
 5 Nzema East Municipal                                 28386               116
 6 Ahanta West Municipal                                47288               581
 7 Effia Kwesimintsim Municipal                         53717              1650
 8 Sekondi Takoradi Metropolitan Area (STMA)            73876              1406
 9 STMA-Takoradi                                        16271               323
10 STMA-Sekondi                                         17137               404
The API returns data as a POST request response. Make sure to use req_body_json() before calling req_perform().

Complete Example

Here’s the full script from start to finish:
library(httr2)
library(tidyverse)

# --- 1. Setup Helper Function ---
build_url = function(URL, ...) {
    path = list(...)
    req = request(URL)
    full_req = purrr::reduce(path, req_url_path_append, .init = req)
    is_table = FALSE
    if (length(path) > 0) {
      if (grepl("\\.px$", path[[length(path)]], ignore.case = TRUE)) {
        is_table = TRUE
      }
    }
    response = req_perform(full_req)
    body = resp_body_json(response)
    if (is_table) {
        message("Endpoint reached: Table found.")
        message("Available variables:")
        print(map_chr(body$variables, "code"))
    } else {
        key = if(length(path) == 0) "dbid" else "id"
        print(map_chr(body, key))
    }
    return(full_req) 
}

# --- 2. Connect and Navigate ---
URL = "https://statsbank.statsghana.gov.gh:443/api/v1/en/"
table_req = build_url(URL, "PHC 2021 StatsBank", "Water and Sanitation", "waterDisposal_table.px")

# --- 3. Define the Query ---
query_list = list(
  query = list(
    list(
      code = "WaterDisposal",
      selection = list(
        filter = "item",
        values = list(
          "Total households in occupied dwelling unit",
          "Through the sewerage system")
      )
    ),
    list(
       code = "Geographic_Area",
       selection = list(filter = "all", values = list("*"))
    )
  ),
  response = list(
    format = "csv"
  )
)

# --- 4. Fetch and Clean Data ---
response = table_req |> 
  req_body_json(query_list) |> 
  req_perform()

data = response |> 
  resp_body_raw() |> 
  readr::read_csv() |> 
  tidyr::pivot_longer(
    cols = -1, 
    names_to = "Geographic_Area", 
    values_to = "count" 
  ) |>
  tidyr::pivot_wider(
    names_from = 1, 
    values_from = count
  ) |> 
  rename(
    total_households = `Total households in occupied dwelling unit`,
    sewerage_disposal = `Through the sewerage system`
  )

head(data, n=10)

Next Steps

Now that you’ve successfully retrieved your first dataset, you can:
  • Explore other databases and tables in the StatsBank
  • Modify queries to retrieve different variables or filters
  • Integrate this workflow into your data analysis pipeline
  • Automate regular data updates using scheduled R scripts
Save your build_url() function in a separate R file and source it across projects for consistent API navigation.

Build docs developers (and LLMs) love