Source code for pycancensus.hierarchy

"""Vector hierarchy navigation functions for pycancensus."""

import re
import warnings
from typing import Dict, List, Optional, Union

import pandas as pd
import requests

from .settings import get_api_key
from .utils import validate_dataset
from .cache import get_cached_data, cache_data


def parent_census_vectors(
    vectors: Union[str, List[str]],
    dataset: Optional[str] = None,
    use_cache: bool = True,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Look up the direct parent of each requested census vector.

    Parameters
    ----------
    vectors : str or list of str
        Child vector ID(s) whose parents are wanted. A single string is
        treated as a one-element list.
    dataset : str, optional
        Dataset to search in. When omitted, it is taken from the second
        underscore-separated field of the first vector ID.
    use_cache : bool, default True
        Whether to read from / write to the local cache.
    api_key : str, optional
        CensusMapper API key. Falls back to ``get_api_key()`` when omitted.

    Returns
    -------
    pd.DataFrame
        One row per distinct parent vector. Empty when ``vectors`` is
        empty, when the vector list cannot be retrieved, or when no parent
        is found.

    Raises
    ------
    ValueError
        If the dataset cannot be inferred, or no API key is available.
    """
    if isinstance(vectors, str):
        vectors = [vectors]
    if not vectors:
        return pd.DataFrame()

    if dataset is None:
        try:
            dataset = vectors[0].split("_")[1]
        except (IndexError, AttributeError):
            raise ValueError("Dataset must be specified or inferable from vectors")
    dataset = validate_dataset(dataset)

    resolved_key = get_api_key() if api_key is None else api_key
    if resolved_key is None:
        raise ValueError(
            "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
            "environment variable."
        )

    # The key is order-insensitive so the same request always hits the same entry.
    joined_ids = "-".join(sorted(vectors))
    cache_key = f"parent_vectors_{dataset}_{joined_ids}"
    if use_cache:
        cached = get_cached_data(cache_key)
        if cached is not None:
            return cached

    # Imported here rather than at module level, as in the original code.
    from .vectors import list_census_vectors

    try:
        catalogue = list_census_vectors(
            dataset, use_cache=use_cache, api_key=resolved_key
        )
    except Exception as e:
        # Best effort: a failed listing degrades to an empty result.
        warnings.warn(f"Could not retrieve vector list for hierarchy: {e}")
        return pd.DataFrame()

    has_parent_column = "parent_vector" in catalogue.columns
    parent_rows = []
    for vector_id in vectors:
        if not has_parent_column:
            # No explicit hierarchy available: fall back to the label heuristic.
            inferred = _infer_parent_vector(vector_id, catalogue)
            if inferred is not None:
                parent_rows.append(inferred)
            continue

        own_rows = catalogue[catalogue["vector"] == vector_id]
        if own_rows.empty:
            continue
        parent_id = own_rows.iloc[0]["parent_vector"]
        if not pd.notna(parent_id):
            continue
        parent_row = catalogue[catalogue["vector"] == parent_id]
        if not parent_row.empty:
            parent_rows.append(parent_row.iloc[0].to_dict())

    result = pd.DataFrame(parent_rows).drop_duplicates()

    # Empty results are never cached.
    if use_cache and not result.empty:
        cache_data(cache_key, result)

    return result
def child_census_vectors(
    vectors: Union[str, List[str]],
    dataset: Optional[str] = None,
    use_cache: bool = True,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Look up the direct children of each requested census vector.

    Parameters
    ----------
    vectors : str or list of str
        Parent vector ID(s). A single string is treated as a one-element
        list.
    dataset : str, optional
        Dataset to search in. When omitted, it is taken from the second
        underscore-separated field of the first vector ID.
    use_cache : bool, default True
        Whether to read from / write to the local cache.
    api_key : str, optional
        CensusMapper API key. Falls back to ``get_api_key()`` when omitted.

    Returns
    -------
    pd.DataFrame
        One row per distinct child vector. Empty when ``vectors`` is empty,
        when the vector list cannot be retrieved, or when no child is found.

    Raises
    ------
    ValueError
        If the dataset cannot be inferred, or no API key is available.
    """
    if isinstance(vectors, str):
        vectors = [vectors]
    if not vectors:
        return pd.DataFrame()

    if dataset is None:
        try:
            dataset = vectors[0].split("_")[1]
        except (IndexError, AttributeError):
            raise ValueError("Dataset must be specified or inferable from vectors")
    dataset = validate_dataset(dataset)

    resolved_key = get_api_key() if api_key is None else api_key
    if resolved_key is None:
        raise ValueError(
            "API key required. Set with set_api_key() or CANCENSUS_API_KEY "
            "environment variable."
        )

    # The key is order-insensitive so the same request always hits the same entry.
    joined_ids = "-".join(sorted(vectors))
    cache_key = f"child_vectors_{dataset}_{joined_ids}"
    if use_cache:
        cached = get_cached_data(cache_key)
        if cached is not None:
            return cached

    # Imported here rather than at module level, as in the original code.
    from .vectors import list_census_vectors

    try:
        catalogue = list_census_vectors(
            dataset, use_cache=use_cache, api_key=resolved_key
        )
    except Exception as e:
        # Best effort: a failed listing degrades to an empty result.
        warnings.warn(f"Could not retrieve vector list for hierarchy: {e}")
        return pd.DataFrame()

    has_parent_column = "parent_vector" in catalogue.columns
    child_rows = []
    for vector_id in vectors:
        if has_parent_column:
            direct_children = catalogue[catalogue["parent_vector"] == vector_id]
            child_rows.extend(row.to_dict() for _, row in direct_children.iterrows())
        else:
            # No explicit hierarchy available: fall back to the label heuristic.
            child_rows.extend(_infer_child_vectors(vector_id, catalogue))

    result = pd.DataFrame(child_rows).drop_duplicates()

    # Empty results are never cached.
    if use_cache and not result.empty:
        cache_data(cache_key, result)

    return result
def find_census_vectors(
    dataset: str,
    query: str,
    search_type: str = "keyword",
    use_cache: bool = True,
    api_key: Optional[str] = None,
) -> pd.DataFrame:
    """
    Enhanced vector search with multiple search types.

    Parameters
    ----------
    dataset : str
        Dataset to search in
    query : str
        Search query
    search_type : str, default "keyword"
        Type of search:

        * ``"keyword"`` - case-insensitive *literal* substring match against
          the label (and ``details`` when that column exists). Any value
          other than ``"exact"`` or ``"regex"`` is treated as keyword.
        * ``"exact"`` - case-insensitive equality with the vector ID or label.
        * ``"regex"`` - case-insensitive regular expression searched in the
          label, the vector ID, and ``details`` when present.
    use_cache : bool, default True
        Whether to use cached data if available
    api_key : str, optional
        API key for CensusMapper API

    Returns
    -------
    pd.DataFrame
        Matching vectors with an added ``relevance_score`` column (+10 when
        the query matches the vector ID, +5 when it matches the label),
        sorted by descending score. Empty when the vector list cannot be
        retrieved, is empty, or the regex is invalid (a warning is issued for
        the first and last case).
    """
    dataset = validate_dataset(dataset)

    # Imported here rather than at module level, as in the original code.
    from .vectors import list_census_vectors

    try:
        all_vectors = list_census_vectors(dataset, use_cache=use_cache, api_key=api_key)
    except Exception as e:
        # Best effort: a failed listing degrades to an empty result.
        warnings.warn(f"Could not retrieve vector list for search: {e}")
        return pd.DataFrame()

    if all_vectors.empty:
        return pd.DataFrame()

    query_lower = query.lower()

    # Only the "regex" mode interprets the query as a pattern.
    pattern = None
    if search_type == "regex":
        try:
            pattern = re.compile(query, re.IGNORECASE)
        except re.error:
            warnings.warn(f"Invalid regex pattern: {query}")
            return pd.DataFrame()

    def contains_query(column: pd.Series) -> pd.Series:
        """Case-insensitive containment test matching the active search mode."""
        if pattern is not None:
            return column.str.contains(pattern, na=False)
        # regex=False: keyword/exact queries are plain text, so characters
        # such as "(" or "+" must not be parsed as regex syntax.
        return column.str.contains(query, case=False, na=False, regex=False)

    has_details = "details" in all_vectors.columns

    if search_type == "exact":
        # Exact match in label or vector ID
        mask = (all_vectors["vector"].str.lower() == query_lower) | (
            all_vectors["label"].str.lower() == query_lower
        )
    elif search_type == "regex":
        mask = contains_query(all_vectors["label"]) | contains_query(
            all_vectors["vector"]
        )
        if has_details:
            mask |= contains_query(all_vectors["details"])
    else:
        # Keyword search (default) in label and details
        mask = contains_query(all_vectors["label"])
        if has_details:
            mask |= contains_query(all_vectors["details"])

    result = all_vectors[mask].copy()

    if not result.empty:
        result["relevance_score"] = 0.0

        # Higher score for matches in vector ID
        vector_match = contains_query(result["vector"])
        result.loc[vector_match, "relevance_score"] += 10

        # Lower, additive score for matches in label
        label_match = contains_query(result["label"])
        result.loc[label_match, "relevance_score"] += 5

        result = result.sort_values("relevance_score", ascending=False)

    return result
def _normalized_label(row: pd.Series) -> str:
    """Return the row's label lower-cased, or "" when it is missing or not a string."""
    label = row.get("label", "")
    return label.lower() if isinstance(label, str) else ""


def _first_positions(all_vectors: pd.DataFrame) -> Dict:
    """Map each vector ID to the position of its first row in ``all_vectors``."""
    positions: Dict = {}
    for position, vector_id in enumerate(all_vectors["vector"]):
        positions.setdefault(vector_id, position)
    return positions


def _infer_parent_vector(vector: str, all_vectors: pd.DataFrame) -> Optional[Dict]:
    """
    Infer parent vector from naming patterns.

    This is a fallback when explicit parent_vector column is not available.

    Lower-numbered vectors sharing the same ``v_<DATASET>_`` prefix are
    checked in ascending order; the first one whose (non-empty) label is a
    case-insensitive substring of this vector's label is returned.

    Parameters
    ----------
    vector : str
        Vector ID of the form ``v_<DATASET>_<number>``.
    all_vectors : pd.DataFrame
        Vector listing with at least ``vector`` and ``label`` columns.

    Returns
    -------
    dict or None
        The matching parent row as a dict, or None when the ID does not
        match the expected pattern, the vector is not in ``all_vectors``,
        its label is missing, or no candidate qualifies.
    """
    match = re.match(r"(v_[A-Z0-9]+_)(\d+)", vector)
    if not match:
        return None

    prefix, number = match.groups()
    vector_num = int(number)

    # Without the vector's own label there is nothing to compare against.
    current_match = all_vectors[all_vectors["vector"] == vector]
    if current_match.empty:
        return None
    current_label = _normalized_label(current_match.iloc[0])
    if not current_label:
        return None

    # One pass to index IDs instead of scanning the frame per candidate.
    positions = _first_positions(all_vectors)

    for candidate_num in range(1, vector_num):
        position = positions.get(f"{prefix}{candidate_num}")
        if position is None:
            continue
        candidate = all_vectors.iloc[position]
        parent_label = _normalized_label(candidate)
        # Simple heuristic: parent label is contained in current label
        if parent_label and parent_label in current_label:
            return candidate.to_dict()

    return None


def _infer_child_vectors(vector: str, all_vectors: pd.DataFrame) -> List[Dict]:
    """
    Infer child vectors from naming patterns.

    This is a fallback when explicit parent_vector column is not available.

    Higher-numbered vectors sharing the same ``v_<DATASET>_`` prefix are
    checked in ascending order, at most 1000 numbers ahead; those whose
    label contains this vector's label (case-insensitive) are collected, up
    to 50 results.

    Parameters
    ----------
    vector : str
        Vector ID of the form ``v_<DATASET>_<number>``.
    all_vectors : pd.DataFrame
        Vector listing with at least ``vector`` and ``label`` columns.

    Returns
    -------
    list of dict
        Matching child rows as dicts; empty when the ID does not match the
        expected pattern, the vector is not in ``all_vectors``, its label is
        missing, or nothing qualifies.
    """
    match = re.match(r"(v_[A-Z0-9]+_)(\d+)", vector)
    if not match:
        return []

    prefix, number = match.groups()
    vector_num = int(number)

    current_match = all_vectors[all_vectors["vector"] == vector]
    if current_match.empty:
        return []
    current_label = _normalized_label(current_match.iloc[0])
    if not current_label:
        return []

    # IDs that do not follow the pattern become NaN and are ignored by max().
    extracted = all_vectors["vector"].str.extract(r"v_[A-Z0-9]+_(\d+)")[0]
    highest = pd.to_numeric(extracted, errors="coerce").max()
    if pd.isna(highest):
        return []
    max_search = min(vector_num + 1000, int(highest))

    # One pass to index IDs instead of scanning the frame per candidate.
    positions = _first_positions(all_vectors)

    children: List[Dict] = []
    for candidate_num in range(vector_num + 1, max_search + 1):
        position = positions.get(f"{prefix}{candidate_num}")
        if position is None:
            continue
        candidate = all_vectors.iloc[position]
        # Simple heuristic: current label is contained in child label
        if current_label in _normalized_label(candidate):
            children.append(candidate.to_dict())
            # Limit to reasonable number of children
            if len(children) >= 50:
                break

    return children