# Source code for pycancensus.datasets

"""
Functions for working with census datasets.
"""

import re
from typing import List, Optional

import pandas as pd
import requests

from .settings import get_api_key, CENSUSMAPPER_API_URL
from .resilience import get_session
from .cache import get_cached_data, cache_data



# [docs]
def list_census_datasets(
    use_cache: bool = True, quiet: bool = False, api_key: Optional[str] = None
) -> pd.DataFrame:
    """
    Query the CensusMapper API for available datasets.

    Parameters
    ----------
    use_cache : bool, default True
        If True, data will be read from local cache if available, and a
        freshly downloaded result is written back to the cache.
    quiet : bool, default False
        When True, suppress the progress messages printed by this function.
    api_key : str, optional
        API key for CensusMapper API. If None, the configured key is used
        (set with set_api_key() or the CANCENSUS_API_KEY environment
        variable).

    Returns
    -------
    pd.DataFrame
        DataFrame with information about available census datasets including:
        - dataset: Dataset identifier (e.g., 'CA16', 'CA21')
        - description: Human-readable description of the dataset
        - geo_dataset: Geographic dataset identifier
        - attribution: Attribution requirements for the dataset

    Raises
    ------
    ValueError
        If no API key is given and none is configured.
    RuntimeError
        If the HTTP request fails (including an HTTP error status) or the
        response cannot be converted into a DataFrame. The underlying
        exception is chained as ``__cause__``.

    Examples
    --------
    >>> import pycancensus as pc
    >>> datasets = pc.list_census_datasets()
    >>> print(datasets)
    """
    if api_key is None:
        api_key = get_api_key()
        if api_key is None:
            raise ValueError(
                "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
                "environment variable."
            )

    # Defined unconditionally so the read and write paths share one key.
    cache_key = "datasets"

    # Check cache first
    if use_cache:
        cached_data = get_cached_data(cache_key)
        if cached_data is not None:
            if not quiet:
                print("Reading datasets from cache...")
            return cached_data

    # Query API
    params = {"api_key": api_key, "format": "json"}

    try:
        if not quiet:
            print("Querying CensusMapper API for available datasets...")

        response = get_session().get(
            f"{CENSUSMAPPER_API_URL}/list_datasets", params=params
        )
        # Report HTTP error statuses as request failures instead of letting
        # an error payload fall through to the JSON/shape checks below.
        response.raise_for_status()

        data = response.json()

        # API returns a list directly, not a dict with "datasets" key
        if isinstance(data, list):
            df = pd.DataFrame(data)
        elif isinstance(data, dict) and "datasets" in data:
            # Fallback for alternative API response format
            df = pd.DataFrame(data["datasets"])
        else:
            # Deliberately raised inside the try block: callers receive it
            # wrapped in RuntimeError like every other processing failure.
            raise ValueError(
                "Invalid API response: expected list of datasets or dict with 'datasets' field"
            )

        # Cache the result
        if use_cache:
            cache_data(cache_key, df)

        if not quiet:
            print(f"Retrieved {len(df)} datasets")

        return df

    except requests.exceptions.RequestException as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"API request failed: {e}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to process API response: {e}") from e



def get_dataset_attribution(dataset: str) -> str:
    """
    Get the required attribution text for a dataset.

    Parameters
    ----------
    dataset : str
        Dataset identifier (e.g., 'CA16'). Matching is done on the
        upper-cased identifier.

    Returns
    -------
    str
        Attribution text that should be included when using the dataset.
        A default Statistics Canada attribution is returned when the dataset
        has no attribution text of its own.

    Raises
    ------
    ValueError
        If the dataset identifier is not among the available datasets.

    Examples
    --------
    >>> import pycancensus as pc
    >>> attribution = pc.get_dataset_attribution("CA16")
    >>> print(attribution)
    """
    datasets_df = list_census_datasets(quiet=True)

    matches = datasets_df[datasets_df["dataset"] == dataset.upper()]
    if matches.empty:
        raise ValueError(f"Dataset {dataset} not found")

    attribution = matches.iloc[0].get("attribution", "")

    # A missing value can surface as None or as a float NaN; NaN is truthy,
    # so a plain falsiness test would return it despite the `-> str`
    # contract. Anything that is not a non-empty string uses the default.
    if isinstance(attribution, str) and attribution:
        return attribution

    # Default attribution text
    return (
        "Source: Statistics Canada, Census Profile. "
        "Reproduced and distributed on an 'as is' basis with the "
        "permission of Statistics Canada."
    )



# [docs]
def dataset_attribution(datasets) -> List[str]:
    """
    Get combined attribution text for multiple datasets.

    This function combines attribution text for multiple datasets, merging
    similar attributions that only differ by year.

    Parameters
    ----------
    datasets : str or list of str
        Dataset identifiers (e.g., ['CA06', 'CA16']). A single identifier
        may be passed as a plain string. Identifiers are upper-cased before
        matching.

    Returns
    -------
    list of str
        List of attribution strings, with similar attributions merged.
        Datasets whose attribution value is missing (not a string) do not
        contribute an entry.

    Raises
    ------
    ValueError
        If none of the requested identifiers match an available dataset.

    Examples
    --------
    >>> import pycancensus as pc
    >>> # Get attribution for multiple census years
    >>> attributions = pc.dataset_attribution(['CA06', 'CA16'])
    >>> for attr in attributions:
    ...     print(attr)
    """
    # A bare string would otherwise be iterated character by character and
    # never match any dataset identifier.
    if isinstance(datasets, str):
        datasets = [datasets]

    # Get all datasets info
    datasets_df = list_census_datasets(quiet=True)

    # Filter for requested datasets
    requested = [d.upper() for d in datasets]
    dataset_rows = datasets_df[datasets_df["dataset"].isin(requested)]

    if dataset_rows.empty:
        raise ValueError(f"No valid datasets found in {requested}")

    return _merge_attributions_by_year(dataset_rows["attribution"].tolist())


def _merge_attributions_by_year(attributions):
    """Collapse attribution strings that are identical apart from 4-digit numbers."""
    year_re = re.compile(r"\d{4}")
    placeholder = "{{YEAR}}"

    # Group attributions by their year-less template; dict insertion order
    # keeps the output in first-seen order.
    grouped = {}
    for text in attributions:
        # Missing values (None / NaN) cannot be pattern-matched; skip them
        # rather than failing inside the regex call.
        if not isinstance(text, str):
            continue
        grouped.setdefault(year_re.sub(placeholder, text), []).append(text)

    merged = []
    for template, members in grouped.items():
        if len(members) == 1:
            # Only one attribution with this template: keep it verbatim.
            merged.append(members[0])
            continue

        # Several attributions share the template: list every distinct year
        # once, in ascending order.
        years = sorted({year for text in members for year in year_re.findall(text)})
        if years:
            merged.append(template.replace(placeholder, ", ".join(years)))
        else:
            # Identical texts containing no years: one copy is enough.
            merged.append(members[0])

    return merged