This guide will walk you through retrieving data from the GSS StatsBank API using the 2021 Population and Housing Census (PHC) as an example.
Step 1: Connect to the API
Start by defining the base URL and initializing a request:
library(httr2)
library(tidyverse)
URL = "https://statsbank.statsghana.gov.gh:443/api/v1/en/"
request = request(URL)
Step 2: Create a Helper Function
To navigate the API’s hierarchical structure efficiently, create a helper function that builds URLs and shows what’s available at each level:
build_url = function(URL, ...) {
path = list(...)
req = request(URL)
# Add each folder name to the URL path incrementally
full_req = purrr::reduce(path, req_url_path_append, .init = req)
# Check if the last part of the path ends in ".px" (indicating a table)
is_table = FALSE
if (length(path) > 0) {
if (grepl("\\.px$", path[[length(path)]], ignore.case = TRUE)) {
is_table = TRUE
}
}
# Fetch the content to see what is inside (GET request)
response = req_perform(full_req)
body = resp_body_json(response)
if (is_table) {
# If it is a table, print the variable names (metadata)
message("Endpoint reached: Table found.")
message("Available variables:")
print(map_chr(body$variables, "code"))
} else {
# If it is a folder/database, list the children IDs
key = if(length(path) == 0) "dbid" else "id"
print(map_chr(body, key))
}
# Return the request object to be assigned to a variable
return(full_req)
}
This function automatically detects whether you’ve reached a table (ends in .px) or a folder, displaying the appropriate information to help you navigate.
Step 3: Navigate to a Table
Use the helper function to explore the database structure:
List available databases
This returns databases like “PHC 2021 StatsBank”, “GLSS7”, “Trade”, etc. Explore a database
build_url(URL, "PHC 2021 StatsBank")
This shows topics like “Water and Sanitation”, “Education and Literacy”, “Population”, etc.Find tables in a topic
build_url(URL, "PHC 2021 StatsBank", "Water and Sanitation")
This lists tables like waterDisposal_table.px, mainwater_table.px, etc.Connect to a specific table
table_req = build_url(URL, "PHC 2021 StatsBank", "Water and Sanitation", "waterDisposal_table.px")
This displays available variables: WaterDisposal, Locality, Geographic_Area
Save the final request as table_req so you can reuse it without retyping the full path.
Step 4: Construct a Query
Define what data you want to retrieve using a JSON query structure:
query_list = list(
query = list(
list(
code = "WaterDisposal",
selection = list(
filter = "item",
values = list(
"Total households in occupied dwelling unit",
"Through the sewerage system"
)
)
),
list(
code = "Geographic_Area",
selection = list(filter = "all", values = list("*"))
)
),
response = list(
format = "csv"
)
)
Use filter = "item" to select specific values, or filter = "all" with values = list("*") to select everything.
Finding Available Values
Not sure what values are available for a variable? Inspect the table metadata:
# Fetch the metadata for the table
metadata = table_req |>
req_perform() |>
resp_body_json()
# Find the variable you're interested in
variable_info = metadata$variables |>
purrr::keep(~ .x$code == "WaterDisposal") |>
purrr::flatten()
# Print the available options
tibble(
Code = unlist(variable_info$values)
)
This returns options like:
- “Total households in occupied dwelling unit”
- “Through the sewerage system”
- “Thrown onto the ground/street/outside”
- “Through drainage into a pit (soak away)“
Step 5: Retrieve and Parse the Data
Execute your query and transform the response into a clean dataframe:
# Attach the query and perform the request
response = table_req |>
req_body_json(query_list) |>
req_perform()
# Parse and clean the data
data = response |>
resp_body_raw() |>
readr::read_csv() |>
tidyr::pivot_longer(
cols = -1,
names_to = "Geographic_Area",
values_to = "count"
) |>
tidyr::pivot_wider(
names_from = 1,
values_from = count
) |>
rename(
total_households = `Total households in occupied dwelling unit`,
sewerage_disposal = `Through the sewerage system`
)
head(data, n=10)
# A tibble: 10 × 3
Geographic_Area total_households sewerage_disposal
<chr> <dbl> <dbl>
1 Ghana 8356779 192638
2 Western 620785 9238
3 Jomoro Municipal 39438 418
4 Ellembelle 37206 651
5 Nzema East Municipal 28386 116
6 Ahanta West Municipal 47288 581
7 Effia Kwesimintsim Municipal 53717 1650
8 Sekondi Takoradi Metropolitan Area (STMA) 73876 1406
9 STMA-Takoradi 16271 323
10 STMA-Sekondi 17137 404
The API returns data as a POST request response. Make sure to use req_body_json() before calling req_perform().
Complete Example
Here’s the full script from start to finish:
library(httr2)
library(tidyverse)
# --- 1. Setup Helper Function ---
build_url = function(URL, ...) {
path = list(...)
req = request(URL)
full_req = purrr::reduce(path, req_url_path_append, .init = req)
is_table = FALSE
if (length(path) > 0) {
if (grepl("\\.px$", path[[length(path)]], ignore.case = TRUE)) {
is_table = TRUE
}
}
response = req_perform(full_req)
body = resp_body_json(response)
if (is_table) {
message("Endpoint reached: Table found.")
message("Available variables:")
print(map_chr(body$variables, "code"))
} else {
key = if(length(path) == 0) "dbid" else "id"
print(map_chr(body, key))
}
return(full_req)
}
# --- 2. Connect and Navigate ---
URL = "https://statsbank.statsghana.gov.gh:443/api/v1/en/"
table_req = build_url(URL, "PHC 2021 StatsBank", "Water and Sanitation", "waterDisposal_table.px")
# --- 3. Define the Query ---
query_list = list(
query = list(
list(
code = "WaterDisposal",
selection = list(
filter = "item",
values = list(
"Total households in occupied dwelling unit",
"Through the sewerage system")
)
),
list(
code = "Geographic_Area",
selection = list(filter = "all", values = list("*"))
)
),
response = list(
format = "csv"
)
)
# --- 4. Fetch and Clean Data ---
response = table_req |>
req_body_json(query_list) |>
req_perform()
data = response |>
resp_body_raw() |>
readr::read_csv() |>
tidyr::pivot_longer(
cols = -1,
names_to = "Geographic_Area",
values_to = "count"
) |>
tidyr::pivot_wider(
names_from = 1,
values_from = count
) |>
rename(
total_households = `Total households in occupied dwelling unit`,
sewerage_disposal = `Through the sewerage system`
)
head(data, n=10)
Next Steps
Now that you’ve successfully retrieved your first dataset, you can:
- Explore other databases and tables in the StatsBank
- Modify queries to retrieve different variables or filters
- Integrate this workflow into your data analysis pipeline
- Automate regular data updates using scheduled R scripts
Save your build_url() function in a separate R file and source it across projects for consistent API navigation.