Source code for pycancensus.vectors

"""
Functions for working with census vectors (variables).
"""

import requests
import pandas as pd
from typing import Optional

from .settings import get_api_key
from .utils import validate_dataset
from .cache import get_cached_data, cache_data


def label_vectors(x):
    """
    Return Census variable names and labels from a data frame.

    Looks up the ``census_vectors`` entry in ``x.attrs`` and returns it
    unchanged. get_census() is expected to populate that entry when called
    with labels='short'; it maps truncated vector names to their detailed
    descriptions.

    Parameters
    ----------
    x : pd.DataFrame or gpd.GeoDataFrame
        A data frame returned from get_census() with labels='short'.

    Returns
    -------
    pd.DataFrame or None
        The object stored under ``x.attrs["census_vectors"]`` (per
        get_census(), a DataFrame with columns 'Vector' (short code) and
        'Detail' (full description)), or None if no vector metadata is
        available. A warning is emitted in the None case.

    Examples
    --------
    >>> import pycancensus as pc
    >>> # Get census data with short labels
    >>> data = pc.get_census(
    ...     dataset='CA21',
    ...     regions={'CMA': '35535'},
    ...     vectors=['v_CA21_1', 'v_CA21_2'],
    ...     level='CSD',
    ...     labels='short'
    ... )
    >>> # Get the vector label details
    >>> labels = pc.label_vectors(data)
    >>> print(labels)
    """
    import warnings

    # getattr with a default also covers objects that have no ``attrs`` at
    # all, or whose ``attrs`` is None.
    attrs = getattr(x, "attrs", None)
    if attrs is not None and "census_vectors" in attrs:
        return attrs["census_vectors"]

    # stacklevel=2 attributes the warning to the caller's line rather than
    # to this module, which is where the user needs to look.
    warnings.warn(
        "Data does not have variables to labels. No Census variables selected "
        "as vectors or data was not retrieved with labels='short'. "
        "See get_census() for more information.",
        stacklevel=2,
    )
    return None
def _normalize_vector_table(df: pd.DataFrame) -> pd.DataFrame:
    """Rename API columns to the package's names and validate the result."""
    # Only these two API columns differ from the public names; rename()
    # silently skips keys that are not present in the frame.
    df = df.rename(columns={"add": "aggregation", "parent": "parent_vector"})

    for col in ("vector", "label", "type"):
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    if "parent_vector" in df.columns:
        # Dict form is unambiguous on every pandas version; the list form
        # with an explicit None forward-filled instead on pandas < 1.4.
        df["parent_vector"] = df["parent_vector"].replace({"": None, "NA": None})

    return df


def list_census_vectors(
    dataset: str,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Query the CensusMapper API for available vectors for a given dataset.

    Parameters
    ----------
    dataset : str
        The dataset to query for available vectors (e.g., 'CA16').
    use_cache : bool, default True
        If True, data will be read from local cache if available, and a
        freshly downloaded table is written back to the cache.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API. If None, the key returned by
        get_api_key() is used.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:
        - vector: Short code for the variable
        - type: Whether it's a female, male, or total aggregate
        - label: Name of the variable
        - units: Whether the value represents a numeric integer, percentage,
          dollar figure, or ratio
        - parent_vector: Hierarchical relationship
        - aggregation: Whether the value is additive or a transformation
        - details: Detailed description generated by traversing all labels
          within its hierarchical structure

        Only 'vector', 'label' and 'type' are checked for; the other columns
        are present when the API response contains them.

    Raises
    ------
    ValueError
        If no API key is supplied and none is configured.
    RuntimeError
        If the HTTP request fails, or the response cannot be parsed or lacks
        a required column. The original exception is attached as
        ``__cause__``.

    Examples
    --------
    >>> import pycancensus as pc
    >>> vectors = pc.list_census_vectors("CA16")
    >>> print(vectors.head())
    """
    import io

    dataset = validate_dataset(dataset)

    if api_key is None:
        api_key = get_api_key()
        if api_key is None:
            raise ValueError(
                "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
                "environment variable."
            )

    # Check cache first
    cache_key = f"vectors_{dataset}"
    if use_cache:
        cached_data = get_cached_data(cache_key)
        if cached_data is not None:
            if not quiet:
                print("Reading vectors from cache...")
            return cached_data

    base_url = "https://censusmapper.ca/api/v1"
    params = {"dataset": dataset, "api_key": api_key}

    try:
        if not quiet:
            print(f"🔍 Querying CensusMapper API for {dataset} vectors...")

        # The CSV endpoint is used instead of the non-working JSON endpoint.
        response = requests.get(
            f"{base_url}/vector_info.csv", params=params, timeout=30
        )
        response.raise_for_status()

        df = _normalize_vector_table(pd.read_csv(io.StringIO(response.text)))

        if use_cache:
            cache_data(cache_key, df)

        if not quiet:
            print(f"✅ Retrieved {len(df)} vectors for {dataset}")
            if len(df) > 1000:
                print(f"📊 Large dataset: {len(df)} variables available")

        return df

    except requests.exceptions.RequestException as e:
        # NOTE(review): the request error text may include the request URL,
        # and with it the api_key query parameter — confirm before logging.
        raise RuntimeError(f"API request failed: {e}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to process API response: {e}") from e
def search_census_vectors(
    search_term: str,
    dataset: str,
    type_filter: Optional[str] = None,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Search for census vectors by name or description.

    Parameters
    ----------
    search_term : str
        Term to search for in vector labels or details. Matching is
        case-insensitive and goes through ``Series.str.contains`` with its
        default settings, so the term is interpreted as a regular expression.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type_filter : str, optional
        Filter by vector type ('Total', 'Male', 'Female'). Compared for exact
        equality against the 'type' column.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered copy of the vector table containing the rows that match the
        search term (and the type filter, when given).

    Examples
    --------
    >>> import pycancensus as pc
    >>> income_vectors = pc.search_census_vectors("income", "CA16")
    >>> total_pop = pc.search_census_vectors("population", "CA16", type_filter="Total")
    """
    # Get all vectors first
    vectors_df = list_census_vectors(
        dataset=dataset, use_cache=use_cache, quiet=quiet, api_key=api_key
    )

    # Search the label column, plus the details column when it exists.
    # Both masks come from vectors_df itself, so they always share its index;
    # no placeholder mask with an unrelated index is needed.
    mask = vectors_df["label"].str.contains(search_term, case=False, na=False)
    if "details" in vectors_df.columns:
        mask = mask | vectors_df["details"].str.contains(
            search_term, case=False, na=False
        )

    filtered_df = vectors_df[mask].copy()

    # Filter by type if specified
    if type_filter is not None:
        filtered_df = filtered_df[filtered_df["type"] == type_filter].copy()

    if not quiet:
        if len(filtered_df) > 0:
            print(f"Found {len(filtered_df)} vectors matching '{search_term}'")
        else:
            print(f"No vectors found matching '{search_term}'")

    return filtered_df
def find_census_vectors(
    search_term: str,
    dataset: str,
    type_filter: Optional[str] = None,
    interactive: bool = False,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Find census vectors with enhanced search capabilities.

    This is an alias for search_census_vectors with potential for future
    enhancement with fuzzy matching and interactive selection.

    Parameters
    ----------
    search_term : str
        Term to search for in vector labels or details.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type_filter : str, optional
        Filter by vector type ('Total', 'Male', 'Female').
    interactive : bool, default False
        If True, provides interactive selection (future enhancement).
        Currently not implemented: the standard search is run instead and a
        notice is printed unless ``quiet`` is True.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame of vectors matching the search term.
    """
    # TODO: Implement interactive vector selection
    if interactive and not quiet:
        # Honour ``quiet`` like every other message in this module.
        print("Interactive mode not yet implemented. Using standard search.")

    return search_census_vectors(
        search_term=search_term,
        dataset=dataset,
        type_filter=type_filter,
        use_cache=use_cache,
        quiet=quiet,
        api_key=api_key,
    )