"""
Functions for working with census vectors (variables).
"""
import requests
import pandas as pd
from typing import Optional
from .settings import get_api_key
from .utils import validate_dataset
from .cache import get_cached_data, cache_data
[docs]
def label_vectors(x):
    """
    Look up the Census vector labels attached to a data frame.

    The function reads the ``census_vectors`` entry of ``x.attrs`` and
    returns it unchanged. That entry is expected to be set by
    ``get_census()`` when it is called with ``labels='short'``, mapping the
    truncated vector names to their detailed descriptions.

    Parameters
    ----------
    x : pd.DataFrame or gpd.GeoDataFrame
        A data frame returned from get_census() with labels='short'.
        Any object exposing an ``attrs`` mapping is accepted.

    Returns
    -------
    pd.DataFrame or None
        The object stored under ``attrs['census_vectors']`` (expected to be
        a DataFrame with columns 'Vector' (short code) and 'Detail' (full
        description)), or None if no vector metadata is available.

    Warns
    -----
    UserWarning
        Emitted when ``x`` has no ``attrs`` attribute or the attribute holds
        no ``census_vectors`` entry.

    Examples
    --------
    >>> import pycancensus as pc
    >>> # Get census data with short labels
    >>> data = pc.get_census(
    ...     dataset='CA21',
    ...     regions={'CMA': '35535'},
    ...     vectors=['v_CA21_1', 'v_CA21_2'],
    ...     level='CSD',
    ...     labels='short'
    ... )
    >>> # Get the vector label details
    >>> labels = pc.label_vectors(data)
    >>> print(labels)
    """
    import warnings

    has_vector_metadata = hasattr(x, "attrs") and "census_vectors" in x.attrs
    if has_vector_metadata:
        return x.attrs["census_vectors"]

    # Nothing to report: tell the caller why and fall through to None.
    message = (
        "Data does not have variables to labels. No Census variables selected "
        "as vectors or data was not retrieved with labels='short'. "
        "See get_census() for more information."
    )
    warnings.warn(message)
    return None
[docs]
def list_census_vectors(
    dataset: str,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Query the CensusMapper API for available vectors for a given dataset.

    Parameters
    ----------
    dataset : str
        The dataset to query for available vectors (e.g., 'CA16').
        It is passed through ``validate_dataset()`` before use.
    use_cache : bool, default True
        If True, data will be read from local cache if available, and a
        freshly downloaded result is written back to the cache.
    quiet : bool, default False
        When True, suppress progress messages.
    api_key : str, optional
        API key for CensusMapper API. If None, ``get_api_key()`` is used.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns (as provided by the API, after renaming
        ``add`` -> ``aggregation`` and ``parent`` -> ``parent_vector``):

        - vector: Short code for the variable
        - type: Whether it's a female, male, or total aggregate
        - label: Name of the variable
        - units: Whether the value represents a numeric integer, percentage,
          dollar figure, or ratio
        - parent_vector: Hierarchical relationship
        - aggregation: Whether the value is additive or a transformation
        - details: Detailed description generated by traversing all labels
          within its hierarchical structure

        Only ``vector``, ``label`` and ``type`` are enforced.

    Raises
    ------
    ValueError
        If no API key is given and ``get_api_key()`` returns None.
    RuntimeError
        If the HTTP request fails, or if the response cannot be parsed or
        lacks one of the required columns. The original exception is
        attached as ``__cause__``.

    Examples
    --------
    >>> import pycancensus as pc
    >>> vectors = pc.list_census_vectors("CA16")
    >>> print(vectors.head())
    """
    import io

    dataset = validate_dataset(dataset)

    if api_key is None:
        api_key = get_api_key()
    if api_key is None:
        raise ValueError(
            "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
            "environment variable."
        )

    # Check cache first
    cache_key = f"vectors_{dataset}"
    if use_cache:
        cached_data = get_cached_data(cache_key)
        if cached_data is not None:
            if not quiet:
                print("Reading vectors from cache...")
            return cached_data

    # Query API using the correct endpoint (discovered via diagnostics)
    base_url = "https://censusmapper.ca/api/v1"
    params = {"dataset": dataset, "api_key": api_key}

    try:
        if not quiet:
            print(f"🔍 Querying CensusMapper API for {dataset} vectors...")

        # Use the working CSV endpoint instead of the non-working JSON endpoint
        response = requests.get(
            f"{base_url}/vector_info.csv", params=params, timeout=30
        )
        response.raise_for_status()

        # Parse CSV response
        df = pd.read_csv(io.StringIO(response.text))

        # Only these two API column names differ from the public schema;
        # rename() silently ignores keys that are absent from the frame.
        df = df.rename(columns={"add": "aggregation", "parent": "parent_vector"})

        # Ensure required columns exist
        for col in ("vector", "label", "type"):
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")

        # Normalise blank / "NA" parents to None. The dict form is used on
        # purpose: replace(list, None) falls back to pad-filling on older
        # pandas releases instead of substituting None.
        if "parent_vector" in df.columns:
            df["parent_vector"] = df["parent_vector"].replace(
                {"": None, "NA": None}
            )

        # Cache the result
        if use_cache:
            cache_data(cache_key, df)

        if not quiet:
            print(f"✅ Retrieved {len(df)} vectors for {dataset}")
            if len(df) > 1000:
                print(f"📊 Large dataset: {len(df)} variables available")

        return df

    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"API request failed: {e}") from e
    except Exception as e:
        # Parsing/validation problems are reported under one exception type
        # so callers only need to handle RuntimeError.
        raise RuntimeError(f"Failed to process API response: {e}") from e
[docs]
def search_census_vectors(
    search_term: str,
    dataset: str,
    type_filter: Optional[str] = None,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Search for census vectors by name or description.

    The full vector list is fetched with ``list_census_vectors()`` and then
    filtered locally.

    Parameters
    ----------
    search_term : str
        Term to search for in vector labels or details. Matching is
        case-insensitive. The term is handed to ``Series.str.contains``,
        which interprets it as a regular expression by default.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type_filter : str, optional
        Filter by vector type ('Total', 'Male', 'Female'). Compared for
        exact equality against the ``type`` column.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered copy of the vector list containing the rows that match the
        search term (and ``type_filter``, when given). May be empty.

    Examples
    --------
    >>> import pycancensus as pc
    >>> income_vectors = pc.search_census_vectors("income", "CA16")
    >>> total_pop = pc.search_census_vectors("population", "CA16", type_filter="Total")
    """
    # Get all vectors first
    vectors_df = list_census_vectors(
        dataset=dataset, use_cache=use_cache, quiet=quiet, api_key=api_key
    )

    # Search in both label and details columns (case-insensitive)
    mask = vectors_df["label"].str.contains(search_term, case=False, na=False)
    if "details" in vectors_df.columns:
        # Only combine when the column exists; building a stand-in all-False
        # Series would carry a fresh RangeIndex and misalign with frames
        # whose index is not the default one.
        mask = mask | vectors_df["details"].str.contains(
            search_term, case=False, na=False
        )

    filtered_df = vectors_df[mask].copy()

    # Filter by type if specified
    if type_filter is not None:
        filtered_df = filtered_df[filtered_df["type"] == type_filter].copy()

    if not quiet:
        if len(filtered_df) > 0:
            print(f"Found {len(filtered_df)} vectors matching '{search_term}'")
        else:
            print(f"No vectors found matching '{search_term}'")

    return filtered_df
def find_census_vectors(
    search_term: str,
    dataset: str,
    type_filter: Optional[str] = None,
    interactive: bool = False,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Find census vectors matching a search term.

    Currently a thin wrapper that forwards every argument except
    ``interactive`` to ``search_census_vectors()``. It exists as the entry
    point for future fuzzy matching and interactive selection.

    Parameters
    ----------
    search_term : str
        Term to search for in vector labels or details.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type_filter : str, optional
        Filter by vector type ('Total', 'Male', 'Female').
    interactive : bool, default False
        Reserved for interactive selection (future enhancement). When True,
        a notice is printed (regardless of ``quiet``) and the standard
        search is performed.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame of vectors matching the search term.
    """
    # TODO: Implement interactive vector selection
    if interactive:
        print("Interactive mode not yet implemented. Using standard search.")

    forwarded = {
        "search_term": search_term,
        "dataset": dataset,
        "type_filter": type_filter,
        "use_cache": use_cache,
        "quiet": quiet,
        "api_key": api_key,
    }
    return search_census_vectors(**forwarded)