Source code for pycancensus.datasets

"""
Functions for working with census datasets.
"""

import re
from typing import List, Optional

import pandas as pd
import requests

from .settings import get_api_key, CENSUSMAPPER_API_URL
from .cache import get_cached_data, cache_data


[docs] def list_census_datasets( use_cache: bool = True, quiet: bool = False, api_key: Optional[str] = None ) -> pd.DataFrame: """ Query the CensusMapper API for available datasets. Parameters ---------- use_cache : bool, default True If True, data will be read from local cache if available. quiet : bool, default False When True, suppress messages and warnings. api_key : str, optional API key for CensusMapper API. If None, uses environment variable. Returns ------- pd.DataFrame DataFrame with information about available census datasets including: - dataset: Dataset identifier (e.g., 'CA16', 'CA21') - description: Human-readable description of the dataset - geo_dataset: Geographic dataset identifier - attribution: Attribution requirements for the dataset Examples -------- >>> import pycancensus as pc >>> datasets = pc.list_census_datasets() >>> print(datasets) """ if api_key is None: api_key = get_api_key() if api_key is None: raise ValueError( "API key required. Set with set_api_key() or CANCENSUS_API_KEY " "environment variable." ) # Check cache first if use_cache: cache_key = "datasets" cached_data = get_cached_data(cache_key) if cached_data is not None: if not quiet: print("Reading datasets from cache...") return cached_data # Query API params = {"api_key": api_key, "format": "json"} try: if not quiet: print("Querying CensusMapper API for available datasets...") response = requests.get( f"{CENSUSMAPPER_API_URL}/list_datasets", params=params, timeout=30 ) response.raise_for_status() data = response.json() # API returns a list directly, not a dict with "datasets" key if isinstance(data, list): df = pd.DataFrame(data) elif isinstance(data, dict) and "datasets" in data: # Fallback for alternative API response format df = pd.DataFrame(data["datasets"]) else: raise ValueError( "Invalid API response: expected list of datasets or dict with 'datasets' field" ) # Cache the result if use_cache: cache_data(cache_key, df) if not quiet: print(f"Retrieved {len(df)} datasets") return df except requests.exceptions.RequestException as e: raise RuntimeError(f"API request failed: {e}") except Exception as e: raise RuntimeError(f"Failed to process API response: {e}")
def get_dataset_attribution(dataset: str) -> str: """ Get the required attribution text for a dataset. Parameters ---------- dataset : str Dataset identifier (e.g., 'CA16'). Returns ------- str Attribution text that should be included when using the dataset. Examples -------- >>> import pycancensus as pc >>> attribution = pc.get_dataset_attribution("CA16") >>> print(attribution) """ datasets_df = list_census_datasets(quiet=True) dataset_row = datasets_df[datasets_df["dataset"] == dataset.upper()] if len(dataset_row) == 0: raise ValueError(f"Dataset {dataset} not found") attribution = dataset_row.iloc[0].get("attribution", "") if not attribution: # Default attribution text attribution = ( "Source: Statistics Canada, Census Profile. " "Reproduced and distributed on an 'as is' basis with the " "permission of Statistics Canada." ) return attribution
[docs] def dataset_attribution(datasets): """ Get combined attribution text for multiple datasets. This function combines attribution text for multiple datasets, merging similar attributions that only differ by year. Parameters ---------- datasets : list of str List of dataset identifiers (e.g., ['CA06', 'CA16']). Returns ------- list of str List of attribution strings, with similar attributions merged. Examples -------- >>> import pycancensus as pc >>> # Get attribution for multiple census years >>> attributions = pc.dataset_attribution(['CA06', 'CA16']) >>> for attr in attributions: ... print(attr) """ # Get all datasets info datasets_df = list_census_datasets(quiet=True) # Filter for requested datasets datasets = [d.upper() for d in datasets] dataset_rows = datasets_df[datasets_df["dataset"].isin(datasets)] if len(dataset_rows) == 0: raise ValueError(f"No valid datasets found in {datasets}") # Get attribution texts attributions = dataset_rows["attribution"].tolist() # Group similar attributions that differ only by year # Create a mapping of pattern to actual attributions pattern_map = {} for attr in attributions: # Replace 4-digit years with placeholder to create pattern pattern = re.sub(r"\d{4}", "{{YEAR}}", attr) if pattern not in pattern_map: pattern_map[pattern] = [] pattern_map[pattern].append(attr) # For each pattern, merge the years result = [] for pattern, attr_list in pattern_map.items(): if len(attr_list) == 1: # Only one attribution with this pattern result.append(attr_list[0]) else: # Multiple attributions with same pattern - merge years # Extract all years from the attributions all_years = [] for attr in attr_list: years = re.findall(r"\d{4}", attr) all_years.extend(years) # Remove duplicates and sort unique_years = sorted(list(set(all_years))) # Replace {{YEAR}} placeholder with merged years if len(unique_years) > 0: year_string = ", ".join(unique_years) merged = pattern.replace("{{YEAR}}", year_string) result.append(merged) else: # No years found, just use first attribution result.append(attr_list[0]) return result