Source code for pycancensus.vectors

"""
Functions for working with census vectors (variables).
"""

import io
import warnings
from typing import Optional

import pandas as pd
import requests

from .settings import get_api_key, CENSUSMAPPER_API_URL
from .utils import validate_dataset
from .cache import get_cached_data, cache_data


def label_vectors(x):
    """
    Return Census variable names and labels from a data frame.

    This function extracts the ``census_vectors`` entry from the ``attrs``
    of a DataFrame returned by get_census() when labels='short'. The entry
    contains a mapping of truncated vector names to their detailed
    descriptions.

    Parameters
    ----------
    x : pd.DataFrame or gpd.GeoDataFrame
        A data frame returned from get_census() with labels='short'.

    Returns
    -------
    pd.DataFrame or None
        A DataFrame with columns 'Vector' (short code) and 'Detail'
        (full description), or None if no vector metadata is available.
        Metadata stored as a list of dicts is converted to a new DataFrame;
        any other stored value is returned unchanged.

    Warns
    -----
    UserWarning
        If ``x`` has no ``attrs`` attribute or ``attrs`` has no
        ``census_vectors`` entry.

    Examples
    --------
    >>> import pycancensus as pc
    >>> # Get census data with short labels
    >>> data = pc.get_census(
    ...     dataset='CA21',
    ...     regions={'CMA': '35535'},
    ...     vectors=['v_CA21_1', 'v_CA21_2'],
    ...     level='CSD',
    ...     labels='short'
    ... )
    >>> # Get the vector label details
    >>> labels = pc.label_vectors(data)
    >>> print(labels)
    """
    if hasattr(x, "attrs") and "census_vectors" in x.attrs:
        metadata = x.attrs["census_vectors"]
        if isinstance(metadata, list):
            # Stored as a list of dicts: rebuild the DataFrame.
            return pd.DataFrame(metadata)
        # Legacy storage: already a DataFrame, hand it back untouched.
        return metadata

    warnings.warn(
        "Data does not have variables to labels. No Census variables selected "
        "as vectors or data was not retrieved with labels='short'. "
        "See get_census() for more information.",
        # Attribute the warning to the caller's line rather than this module.
        stacklevel=2,
    )
    return None
def list_census_vectors(
    dataset: str,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Query the CensusMapper API for available vectors for a given dataset.

    Parameters
    ----------
    dataset : str
        The dataset to query for available vectors (e.g., 'CA16').
    use_cache : bool, default True
        If True, data will be read from local cache if available, and a
        freshly downloaded result is written to the cache.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API. If None, uses environment variable.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:
        - vector: Short code for the variable
        - type: Whether it's a female, male, or total aggregate
        - label: Name of the variable
        - units: Whether the value represents a numeric integer,
          percentage, dollar figure, or ratio
        - parent_vector: Hierarchical relationship
        - aggregation: Whether the value is additive or a transformation
        - details: Detailed description generated by traversing all labels
          within its hierarchical structure

    Raises
    ------
    ValueError
        If no API key is passed and get_api_key() returns None.
    RuntimeError
        If the HTTP request fails, or if the response cannot be processed
        (this includes a response lacking the 'vector', 'label' or 'type'
        column). The original exception is attached as ``__cause__``.

    Examples
    --------
    >>> import pycancensus as pc
    >>> vectors = pc.list_census_vectors("CA16")
    >>> print(vectors.head())
    """
    dataset = validate_dataset(dataset)

    if api_key is None:
        api_key = get_api_key()
    if api_key is None:
        raise ValueError(
            "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
            "environment variable."
        )

    cache_key = f"vectors_{dataset}"

    # Check cache first
    if use_cache:
        cached_data = get_cached_data(cache_key)
        if cached_data is not None:
            if not quiet:
                print("Reading vectors from cache...")
            return cached_data

    params = {"dataset": dataset, "api_key": api_key}

    try:
        if not quiet:
            print(f"🔍 Querying CensusMapper API for {dataset} vectors...")

        # Use the working CSV endpoint instead of the non-working JSON endpoint
        response = requests.get(
            f"{CENSUSMAPPER_API_URL}/vector_info.csv", params=params, timeout=30
        )
        response.raise_for_status()

        df = pd.read_csv(io.StringIO(response.text))

        # Only these two API column names differ from the documented output
        # names; rename() skips any key that is not present in the frame.
        df = df.rename(columns={"add": "aggregation", "parent": "parent_vector"})

        for col in ("vector", "label", "type"):
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")

        if "parent_vector" in df.columns:
            # Dict form names the replacement value explicitly. With the list
            # form and value=None, older pandas releases fall back to the
            # `method='pad'` fill instead of inserting None.
            df["parent_vector"] = df["parent_vector"].replace(
                {"": None, "NA": None}
            )

        if use_cache:
            cache_data(cache_key, df)

        if not quiet:
            print(f"✅ Retrieved {len(df)} vectors for {dataset}")
            if len(df) > 1000:
                print(f"📊 Large dataset: {len(df)} variables available")

        return df

    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"API request failed: {e}") from e
    except Exception as e:
        # Broad on purpose: callers rely on RuntimeError for any processing
        # failure. Chaining keeps the original traceback available.
        raise RuntimeError(f"Failed to process API response: {e}") from e
def search_census_vectors(
    search_term: str,
    dataset: str,
    type_filter: Optional[str] = None,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Search for census vectors by name or description.

    Parameters
    ----------
    search_term : str
        Term to search for in vector labels or details. It is passed to
        ``Series.str.contains`` with its default pattern handling, so
        regular-expression metacharacters are interpreted as such.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type_filter : str, optional
        Filter by vector type ('Total', 'Male', 'Female'). Compared for
        exact equality against the 'type' column.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame (a copy) of vectors matching the search term.

    Examples
    --------
    >>> import pycancensus as pc
    >>> income_vectors = pc.search_census_vectors("income", "CA16")
    >>> total_pop = pc.search_census_vectors("population", "CA16", type_filter="Total")
    """
    # Get all vectors first
    vectors_df = list_census_vectors(
        dataset=dataset, use_cache=use_cache, quiet=quiet, api_key=api_key
    )

    # Case-insensitive match on the label; missing labels never match.
    mask = vectors_df["label"].str.contains(search_term, case=False, na=False)

    # OR in the details column only when it exists. This avoids building a
    # placeholder mask with a fresh RangeIndex, which would misalign with
    # vectors_df whenever its index is not 0..n-1.
    if "details" in vectors_df.columns:
        mask = mask | vectors_df["details"].str.contains(
            search_term, case=False, na=False
        )

    filtered_df = vectors_df[mask].copy()

    # Filter by type if specified
    if type_filter is not None:
        filtered_df = filtered_df[filtered_df["type"] == type_filter].copy()

    if not quiet:
        if len(filtered_df) > 0:
            print(f"Found {len(filtered_df)} vectors matching '{search_term}'")
        else:
            print(f"No vectors found matching '{search_term}'")

    return filtered_df
def find_census_vectors(
    search_term: str,
    dataset: str,
    type_filter: Optional[str] = None,
    interactive: bool = False,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Find census vectors with enhanced search capabilities.

    This is an alias for search_census_vectors with potential for future
    enhancement with fuzzy matching and interactive selection.

    Parameters
    ----------
    search_term : str
        Term to search for in vector labels or details.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type_filter : str, optional
        Filter by vector type ('Total', 'Male', 'Female').
    interactive : bool, default False
        If True, provides interactive selection (future enhancement).
        Currently it has no effect on the result; a notice is printed
        unless ``quiet`` is True.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame of vectors matching the search term.
    """
    # TODO: Implement interactive vector selection
    # The notice honours `quiet`, like every other message in this module.
    if interactive and not quiet:
        print("Interactive mode not yet implemented. Using standard search.")

    return search_census_vectors(
        search_term=search_term,
        dataset=dataset,
        type_filter=type_filter,
        use_cache=use_cache,
        quiet=quiet,
        api_key=api_key,
    )