Source code for pycancensus.regions

"""
Functions for working with census regions.
"""

import requests
import pandas as pd
from typing import Optional

from .settings import get_api_key
from .utils import validate_dataset
from .cache import get_cached_data, cache_data


def list_census_regions(
    dataset: str,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Query the CensusMapper API for available regions in a given dataset.

    Parameters
    ----------
    dataset : str
        The dataset to query for available regions (e.g., 'CA16').
    use_cache : bool, default True
        If True, data will be read from local cache if available, and a
        freshly downloaded region list is written back to the cache.
    quiet : bool, default False
        When True, suppress progress messages.
    api_key : str, optional
        API key for CensusMapper API. If None, the key is looked up with
        ``get_api_key()``.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:
        - region: The region identifier (read as a string)
        - name: The name of that region
        - level: The census aggregation level of that region
        - pop: The population of that region
        - municipal_status: Additional identifiers for municipal status
        - CMA_UID: The identifier for the Census Metropolitan Area (if any)
        - CD_UID: The identifier for the Census District (if any)
        - PR_UID: The identifier for the province (if any)

    Raises
    ------
    ValueError
        If no API key is passed and none is configured.
    RuntimeError
        If the HTTP request fails, or if the response cannot be parsed
        (or cached) afterwards.

    Examples
    --------
    >>> import pycancensus as pc
    >>> regions = pc.list_census_regions("CA16")
    >>> print(regions.head())
    """
    import io

    dataset = validate_dataset(dataset)

    if api_key is None:
        api_key = get_api_key()

    # NOTE(review): the key is required here but is not sent with the
    # request below; kept so existing callers see the same ValueError.
    if api_key is None:
        raise ValueError(
            "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
            "environment variable."
        )

    # Check cache first
    if use_cache:
        cache_key = f"regions_{dataset}"
        cached_data = get_cached_data(cache_key)
        if cached_data is not None:
            if not quiet:
                print("Reading regions from cache...")
            return cached_data

    # Same endpoint as R cancensus:
    # https://censusmapper.ca/data_sets/{dataset}/place_names.csv
    url = f"https://censusmapper.ca/data_sets/{dataset}/place_names.csv"

    # CSV columns: name, geo_uid, type, population, flag, CMA_UID, CD_UID, PR_UID
    # Output:      name, region, level, pop, municipal_status, CMA_UID, CD_UID, PR_UID
    column_mapping = {
        "geo_uid": "region",
        "type": "level",
        "population": "pop",
        "flag": "municipal_status",
    }

    # Identifier columns must stay text: letting pandas infer a numeric dtype
    # strips leading zeros (e.g. "01" -> 1) and turns ID columns that contain
    # missing values into floats.
    id_dtypes = {
        "geo_uid": str,
        "CMA_UID": str,
        "CD_UID": str,
        "PR_UID": str,
    }

    try:
        if not quiet:
            print(f"Querying CensusMapper API for {dataset} regions...")

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Parse the CSV payload and rename columns to the documented names.
        df = pd.read_csv(io.StringIO(response.text), dtype=id_dtypes)
        df = df.rename(columns=column_mapping)

        # Cache the result
        if use_cache:
            cache_data(cache_key, df)

        if not quiet:
            print(f"Retrieved {len(df)} regions")

        return df

    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"API request failed: {e}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to process API response: {e}") from e
def search_census_regions(
    search_term: str,
    dataset: str,
    level: Optional[str] = None,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Search for census regions by name.

    The region list is obtained from ``list_census_regions`` and filtered on
    its ``name`` column, ignoring case. ``search_term`` is handed to
    ``Series.str.contains`` and is therefore treated as a regular expression;
    if it is not a valid pattern (for example ``"Vancouver ("``), it is
    matched as a literal substring instead of raising.

    Parameters
    ----------
    search_term : str
        Term to search for in region names.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    level : str, optional
        Filter by census aggregation level (exact match on the ``level``
        column).
    use_cache : bool, default True
        If True, uses cached region list if available.
    quiet : bool, default False
        When True, suppress progress messages.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered copy of the region list containing only the rows that match
        the search term (and ``level``, when given).

    Examples
    --------
    >>> import pycancensus as pc
    >>> vancouver_regions = pc.search_census_regions("Vancouver", "CA16")
    >>> toronto_cmas = pc.search_census_regions("Toronto", "CA16", level="CMA")
    """
    import re

    # Get all regions first
    regions_df = list_census_regions(
        dataset=dataset, use_cache=use_cache, quiet=quiet, api_key=api_key
    )

    # Case-insensitive name match; rows with a missing name never match.
    names = regions_df["name"]
    try:
        mask = names.str.contains(search_term, case=False, na=False)
    except re.error:
        # Not a valid regular expression: fall back to plain substring search.
        mask = names.str.contains(
            search_term, case=False, na=False, regex=False
        )

    # Narrow by aggregation level if requested
    if level is not None:
        mask = mask & (regions_df["level"] == level)

    filtered_df = regions_df[mask].copy()

    if not quiet:
        if len(filtered_df) > 0:
            print(f"Found {len(filtered_df)} regions matching '{search_term}'")
        else:
            print(f"No regions found matching '{search_term}'")

    return filtered_df