Source code for pycancensus.vectors

"""
Functions for working with census vectors (variables).
"""

import io
import re
import warnings
from typing import Optional

import pandas as pd
import requests

from .settings import get_api_key, CENSUSMAPPER_API_URL
from .resilience import CensusAPIError, NetworkError, get_session
from .utils import validate_dataset
from .cache import get_cached_data, cache_data, session_cache_get, session_cache_set



[docs]
def label_vectors(x):
    """
    Look up the census vector labels attached to a data frame.

    The labels are read from ``x.attrs["census_vectors"]``, the attribute
    populated on frames returned by get_census() when labels='short'. It
    maps the truncated vector names to their detailed descriptions.

    Parameters
    ----------
    x : pd.DataFrame or gpd.GeoDataFrame
        A data frame returned from get_census() with labels='short'.

    Returns
    -------
    pd.DataFrame or None
        A DataFrame with columns 'Vector' (short code) and 'Detail'
        (full description), or None if no vector metadata is available.

    Warns
    -----
    UserWarning
        When ``x`` has no ``attrs`` or no ``census_vectors`` entry in it.

    Examples
    --------
    >>> import pycancensus as pc
    >>> data = pc.get_census(
    ...     dataset='CA21',
    ...     regions={'CMA': '35535'},
    ...     vectors=['v_CA21_1', 'v_CA21_2'],
    ...     level='CSD',
    ...     labels='short'
    ... )
    >>> labels = pc.label_vectors(data)
    >>> print(labels)
    """
    has_vector_metadata = hasattr(x, "attrs") and "census_vectors" in x.attrs
    if not has_vector_metadata:
        warnings.warn(
            "Data does not have variables to labels. No Census variables selected "
            "as vectors or data was not retrieved with labels='short'. "
            "See get_census() for more information."
        )
        return None

    stored = x.attrs["census_vectors"]
    # A list of dicts is rebuilt into a frame; any other stored value
    # (the legacy form was a DataFrame) is handed back unchanged.
    return pd.DataFrame(stored) if isinstance(stored, list) else stored




[docs]
def list_census_vectors(
    dataset: str,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    List the vectors (variables) the CensusMapper API offers for a dataset.

    The lookup order is: in-memory session cache, then file cache, then a
    request to ``<CENSUSMAPPER_API_URL>/vector_info/<dataset>.csv``. A fresh
    download is always stored in the session cache and, when ``use_cache``
    is True, in the file cache as well.

    Parameters
    ----------
    dataset : str
        The dataset to query for available vectors (e.g., 'CA16').
    use_cache : bool, default True
        If True, data will be read from local cache if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API. If None, the key returned by
        get_api_key() is used.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:
        - vector: Short code for the variable
        - type: Whether it's a female, male, or total aggregate
        - label: Name of the variable
        - units: Whether the value represents a numeric integer, percentage,
                dollar figure, or ratio
        - parent_vector: Hierarchical relationship
        - aggregation: Whether the value is additive or a transformation
        - details: Detailed description generated by traversing all labels
                  within its hierarchical structure

    Raises
    ------
    ValueError
        If no API key is supplied and get_api_key() returns None.
    CensusAPIError
        Re-raised unchanged when raised while fetching or processing.
    NetworkError
        Wraps any ``requests.exceptions.RequestException``.
    RuntimeError
        Wraps any other failure while fetching or processing the response,
        including a response that lacks a required column.

    Examples
    --------
    >>> import pycancensus as pc
    >>> vectors = pc.list_census_vectors("CA16")
    >>> print(vectors.head())
    """
    dataset = validate_dataset(dataset)

    # Resolve the key before touching any cache, so a missing key is
    # reported even when cached data exists.
    resolved_key = get_api_key() if api_key is None else api_key
    if resolved_key is None:
        raise ValueError(
            "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
            "environment variable."
        )

    cache_key = f"vectors_{dataset}"
    if use_cache:
        from_session = session_cache_get(cache_key)
        if from_session is not None:
            return from_session

        from_disk = get_cached_data(cache_key)
        if from_disk is not None:
            if not quiet:
                print("Reading vectors from cache...")
            # Promote the file-cache hit into the session cache
            session_cache_set(cache_key, from_disk)
            return from_disk

    # Dataset is a path component, matching the R package:
    # /api/v1/vector_info/<dataset>.csv (the query-param form returns 404)
    params = {"api_key": resolved_key}

    try:
        if not quiet:
            print(f"🔍 Querying CensusMapper API for {dataset} vectors...")

        endpoint = f"{CENSUSMAPPER_API_URL}/vector_info/{dataset}.csv"
        response = get_session().get(endpoint, params=params)

        df = pd.read_csv(io.StringIO(response.text))

        # API column name -> name exposed by this package
        column_mapping = {
            "vector": "vector",
            "label": "label",
            "type": "type",
            "units": "units",
            "add": "aggregation",
            "parent": "parent_vector",
            "details": "details",
        }
        # Only entries that actually change a present column need renaming
        renames = {
            source: target
            for source, target in column_mapping.items()
            if source != target and source in df.columns
        }
        if renames:
            df = df.rename(columns=renames)

        first_missing = next(
            (col for col in ["vector", "label", "type"] if col not in df.columns),
            None,
        )
        if first_missing is not None:
            raise ValueError(f"Missing required column: {first_missing}")

        # Treat blank / "NA" parent references as missing
        if "parent_vector" in df.columns:
            df["parent_vector"] = df["parent_vector"].replace(["", "NA"], None)

        # The session cache is refreshed unconditionally; the file cache
        # only when caching was requested.
        session_cache_set(cache_key, df)
        if use_cache:
            cache_data(cache_key, df)

        if not quiet:
            n_vectors = len(df)
            print(f"✅ Retrieved {n_vectors} vectors for {dataset}")
            if n_vectors > 1000:
                print(f"📊 Large dataset: {n_vectors} variables available")

        return df

    except CensusAPIError:
        raise
    except requests.exceptions.RequestException as e:
        raise NetworkError(str(e)) from e
    except Exception as e:
        raise RuntimeError(f"Failed to process API response: {e}") from e




[docs]
def search_census_vectors(
    search_term: str,
    dataset: str,
    type_filter: Optional[str] = None,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Search for census vectors by name or description.

    The search term is matched case-insensitively and literally (not as a
    regular expression) against the ``label`` column and, when present,
    the ``details`` column of the dataset's vector list.

    Parameters
    ----------
    search_term : str
        Term to search for in vector labels or details.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type_filter : str, optional
        Filter by vector type ('Total', 'Male', 'Female'). Compared for
        exact equality with the ``type`` column.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame of vectors matching the search term.

    Examples
    --------
    >>> import pycancensus as pc
    >>> income_vectors = pc.search_census_vectors("income", "CA16")
    >>> total_pop = pc.search_census_vectors("population", "CA16", type_filter="Total")
    """
    # Get all vectors first
    vectors_df = list_census_vectors(
        dataset=dataset, use_cache=use_cache, quiet=quiet, api_key=api_key
    )

    # Search in both label and details columns (case-insensitive, literal —
    # regex metacharacters in queries like "income ($)" match literally)
    label_mask = vectors_df["label"].str.contains(
        search_term, case=False, na=False, regex=False
    )
    if "details" in vectors_df.columns:
        details_mask = vectors_df["details"].str.contains(
            search_term, case=False, na=False, regex=False
        )
    else:
        # Build the all-False fallback on the frame's own index. A fresh
        # RangeIndex would be re-aligned by `|` against label_mask and give
        # an unalignable indexer whenever vectors_df is indexed differently.
        details_mask = pd.Series(False, index=vectors_df.index)

    filtered_df = vectors_df[label_mask | details_mask].copy()

    # Filter by type if specified
    if type_filter is not None:
        filtered_df = filtered_df[filtered_df["type"] == type_filter].copy()

    if not quiet:
        if len(filtered_df) > 0:
            print(f"Found {len(filtered_df)} vectors matching '{search_term}'")
        else:
            print(f"No vectors found matching '{search_term}'")

    return filtered_df




[docs]
def find_census_vectors(
    query: str,
    dataset: str,
    type: str = "all",
    query_type: str = "exact",
    interactive: bool = False,
    use_cache: bool = True,
    quiet: bool = False,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Locate census vectors with an exact, semantic, or keyword search.

    Mirrors R cancensus's find_census_vectors(). An exact search looks for
    the query as a literal, case-insensitive substring of each vector's
    details. A semantic search tolerates spelling and phrasing differences
    via n-gram edit-distance matching. A keyword search splits the query
    into words and ranks vectors by how many of them match.

    Parameters
    ----------
    query : str
        Search query.
    dataset : str
        The dataset to search in (e.g., 'CA16').
    type : str, default "all"
        Filter by vector type: 'all', 'total', 'male', or 'female'
        (case-insensitive).
    query_type : str, default "exact"
        One of 'exact', 'semantic', or 'keyword' (case-insensitive).
    interactive : bool, default False
        For keyword search: prompt to show lower-precision matches beyond
        the top-ranked results.
    use_cache : bool, default True
        If True, uses cached vector list if available.
    quiet : bool, default False
        When True, suppress messages and warnings. Only forwarded to the
        semantic search; the vector list itself is always fetched quietly.
    api_key : str, optional
        API key for CensusMapper API.

    Returns
    -------
    pd.DataFrame
        Matching vectors with columns vector, type, label, details.

    Raises
    ------
    ValueError
        If ``type`` or ``query_type`` is not one of the accepted values.

    Examples
    --------
    >>> import pycancensus as pc
    >>> pc.find_census_vectors('Oji-cree', dataset='CA16', type='total')
    >>> pc.find_census_vectors('after tax income', 'CA16', query_type='semantic')
    >>> pc.find_census_vectors('commute duration', 'CA16', query_type='keyword')
    """
    type_key = type.lower()
    search_mode = query_type.lower()

    # Validate both switches before doing any network or cache work
    if type_key not in ("total", "male", "female", "all"):
        raise ValueError(
            "Type must be one of 'all', 'total', 'female', or 'male'. "
            "See help(find_census_vectors) for more details."
        )
    if search_mode not in ("exact", "semantic", "keyword"):
        raise ValueError(
            "Query type must be one of 'exact', 'semantic', or 'keyword'. "
            "See help(find_census_vectors) for more details."
        )

    all_vectors = list_census_vectors(
        dataset, use_cache=use_cache, quiet=True, api_key=api_key
    )
    # Work on a copy so the stripped details never leak into the cached frame
    vector_list = all_vectors[["vector", "type", "label", "details"]].copy()

    # Strip the common "... Census; 100% data;" prefix from details, like R
    vector_list["details"] = vector_list["details"].str.replace(
        r"^(.*)Census; |100% data; ", "", regex=True
    )

    if type_key != "all":
        # The type column is compared against the title-cased form,
        # e.g. "total" -> "Total"
        vector_list = vector_list[vector_list["type"] == type_key.title()]

    if search_mode == "semantic":
        return _semantic_search(query, vector_list, quiet=quiet)
    if search_mode == "keyword":
        return _keyword_search(query, vector_list, interactive=interactive)

    # Exact search: literal, case-insensitive substring match on details
    is_hit = vector_list["details"].str.contains(
        query, case=False, na=False, regex=False
    )
    exact_hits = vector_list[is_hit]
    if exact_hits.empty:
        warnings.warn(
            "No exact matches found. Please check spelling and try again "
            "or consider using semantic or keyword search.\n"
            "See help(find_census_vectors) for more details."
        )
    return exact_hits



def _bounded_levenshtein(a: str, b: str, max_dist: int = 2) -> int:
    """Levenshtein distance, returning max_dist + 1 once it exceeds max_dist."""
    if abs(len(a) - len(b)) > max_dist:
        return max_dist + 1
    if a == b:
        return 0
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        row_min = i
        for j, cb in enumerate(b, start=1):
            cost = min(
                previous[j] + 1,  # deletion
                current[j - 1] + 1,  # insertion
                previous[j - 1] + (ca != cb),  # substitution
            )
            current.append(cost)
            row_min = min(row_min, cost)
        if row_min > max_dist:
            return max_dist + 1
        previous = current
    return previous[-1]


def _clean_text(text: str) -> str:
    """Lowercase, replace punctuation with spaces, collapse whitespace."""
    return re.sub(r"\s+", " ", re.sub(r"[^\w\s]", " ", text.lower())).strip()


def _semantic_search(
    query: str, vector_list: pd.DataFrame, quiet: bool = False
) -> pd.DataFrame:
    """N-gram edit-distance search, mirroring R cancensus semantic_search()."""
    details = vector_list["details"].fillna("")
    clean_details = [_clean_text(d) for d in details]

    query_words = [w for w in re.split(r"[^a-z]+", query.lower()) if w]
    word_count = max(len(query_words), 1)

    # Build word-count-length n-grams (suffix n-grams at sentence ends)
    ngram_counts: dict = {}
    for sentence in clean_details:
        words = sentence.split()
        if not words:
            continue
        if word_count == 1:
            grams = words
        else:
            grams = [" ".join(words[i : i + word_count]) for i in range(len(words))]
        for gram in grams:
            ngram_counts[gram] = ngram_counts.get(gram, 0) + 1
    if not ngram_counts:
        raise ValueError("No census vector details available to search against.")

    # Most frequent n-grams first, matching R's table() ordering
    ordered_ngrams = [g for g, _ in sorted(ngram_counts.items(), key=lambda kv: -kv[1])]

    # Best match for the full query plus each individual query word
    revised_query = [query.lower()] + query.lower().split()
    best_ngrams = []
    overall_min = None
    for term in revised_query:
        term_best_dist = None
        term_best_gram = None
        for gram in ordered_ngrams:
            # Levenshtein is bounded below by the length difference; skip
            # candidates that can never come within the match threshold
            if abs(len(gram) - len(term)) > 2:
                continue
            dist = _bounded_levenshtein(term, gram, max_dist=2)
            if term_best_dist is None or dist < term_best_dist:
                term_best_dist = dist
                term_best_gram = gram
                if dist == 0:
                    break
        if term_best_dist is not None and term_best_dist <= 2:
            if term_best_gram not in best_ngrams:
                best_ngrams.append(term_best_gram)
        if term_best_dist is not None:
            overall_min = (
                term_best_dist
                if overall_min is None
                else min(overall_min, term_best_dist)
            )

    if not best_ngrams:
        warnings.warn(
            "No close matches found. Please check spelling and try again or "
            "consider using keyword search instead.\n"
            "See help(find_census_vectors) for more details."
        )
        return vector_list.iloc[0:0]

    pattern = "|".join(re.escape(g) for g in best_ngrams)
    matched = [
        bool(re.search(pattern, clean, re.IGNORECASE)) for clean in clean_details
    ]
    result = vector_list[matched]
    if len(result) > 1 and not quiet:
        print("Multiple possible matches. Results ordered by closeness.")
    return result


def _keyword_search(
    query: str, vector_list: pd.DataFrame, interactive: bool = False
) -> pd.DataFrame:
    """Unigram match-count search, mirroring R cancensus keyword_search()."""
    details = vector_list["details"].fillna("")
    # Deduplicate words within each detail string so repeated words don't
    # inflate the match count
    clean_details = []
    for d in details:
        words = _clean_text(d).split()
        seen: set = set()
        unique_words = [w for w in words if not (w in seen or seen.add(w))]
        clean_details.append(" ".join(unique_words))

    # Drop empty tokens (e.g. from queries starting with a digit); an empty
    # regex alternative would match every vector
    query_tokens = [t for t in re.split(r"[^a-z]+", query.lower()) if t]
    if not query_tokens:
        warnings.warn(
            "No matches found. Please check spelling and try again or "
            "consider using semantic search instead.\n"
            "See help(find_census_vectors) for more details."
        )
        return vector_list.iloc[0:0]

    word_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(t) for t in query_tokens) + r")\b",
        re.IGNORECASE,
    )
    n_matches = pd.Series(
        [len(word_pattern.findall(clean)) for clean in clean_details],
        index=vector_list.index,
    )

    if (n_matches == 0).all():
        warnings.warn(
            "No matches found. Please check spelling and try again or "
            "consider using semantic search instead.\n"
            "See help(find_census_vectors) for more details."
        )
        return vector_list.iloc[0:0]

    max_matches = n_matches.max()
    top_res = vector_list[n_matches == max_matches]
    other_res = vector_list[(n_matches > 0) & (n_matches < max_matches)]

    if other_res.empty or not interactive:
        return top_res

    print(top_res)
    answer = input(
        f"\nThere are {len(other_res)} additional keyword matches with less "
        "precision. Show more? [y/N] "
    )
    if answer.strip().lower().startswith("y"):
        return other_res
    print(f"Showing top {len(top_res)} results only")
    return top_res



[docs]
def explore_census_vectors(dataset: str = "CA16") -> None:
    """
    Launch the CensusMapper interactive variable explorer in a web browser.

    The dataset code is passed through validate_dataset() and the page
    ``https://censusmapper.ca/api/<dataset>#api_variable`` is opened.

    Parameters
    ----------
    dataset : str, default "CA16"
        The dataset to explore vectors for.
    """
    import webbrowser

    dataset = validate_dataset(dataset)

    notice = (
        "Opening interactive census variable explorer at censusmapper.ca/api "
        "in the browser"
    )
    print(notice)

    explorer_url = f"https://censusmapper.ca/api/{dataset}#api_variable"
    webbrowser.open(explorer_url)