"""
Caching functionality for pycancensus.
"""
import os
import pickle
import hashlib
from pathlib import Path
from typing import Any, Optional, List
import pandas as pd
import geopandas as gpd
from .settings import get_cache_path
def get_cached_data(cache_key: str) -> Optional[Any]:
    """
    Retrieve data from cache if it exists.

    Parameters
    ----------
    cache_key : str
        Unique identifier for the cached data.

    Returns
    -------
    Any or None
        Cached data if found, None otherwise. None is also returned when
        the cache file exists but cannot be read or unpickled; in that
        case a best-effort attempt is made to delete the bad file.

    Notes
    -----
    Entries are deserialized with ``pickle``, which can execute arbitrary
    code while loading. The cache directory must therefore only contain
    files written by a trusted source.
    """
    cache_file = Path(get_cache_path()) / f"{cache_key}.pkl"

    try:
        with open(cache_file, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        # Plain cache miss: nothing to clean up.
        return None
    except Exception:
        # The entry exists but is unreadable or corrupted. Remove it so the
        # next call can repopulate it, but never let the cleanup itself
        # (e.g. permission denied, entry is a directory) escape to the
        # caller: a broken cache must behave like a cache miss.
        try:
            cache_file.unlink(missing_ok=True)
        except OSError:
            pass
        return None
def cache_data(cache_key: str, data: Any) -> None:
    """
    Cache data to disk.

    The data is pickled to a temporary file in the cache directory and then
    moved over the final ``<cache_key>.pkl`` file, so a failed or
    interrupted write never truncates an existing valid entry and never
    leaves a partially written ``.pkl`` file behind.

    Caching is best-effort: any failure (including being unable to create
    the cache directory) is reported as a printed warning and is not raised.

    Parameters
    ----------
    cache_key : str
        Unique identifier for the data.
    data : Any
        Data to cache. Must be picklable.
    """
    cache_path = Path(get_cache_path())
    cache_file = cache_path / f"{cache_key}.pkl"
    # The ".tmp" suffix keeps in-progress files out of "*.pkl" globs; the
    # PID keeps concurrent processes from sharing one temporary file.
    tmp_file = cache_path / f"{cache_key}.pkl.{os.getpid()}.tmp"

    try:
        cache_path.mkdir(parents=True, exist_ok=True)
        with open(tmp_file, "wb") as f:
            pickle.dump(data, f)
        # Replace the destination in a single step.
        os.replace(tmp_file, cache_file)
    except Exception as e:
        # Drop whatever was partially written; ignore cleanup failures so
        # the original error is the one that gets reported.
        try:
            tmp_file.unlink(missing_ok=True)
        except OSError:
            pass
        print(f"Warning: Failed to cache data: {e}")
[docs]
def list_cache() -> pd.DataFrame:
    """
    List all cached data files.

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per ``*.pkl`` file in the cache directory
        and the following columns (always present, even when the cache is
        empty or the cache directory does not exist):

        - cache_key: The cache key (file name without ``.pkl``)
        - file_path: Full path to cached file
        - size_mb: File size in MB, rounded to two decimals
        - created: Timestamp taken from ``st_ctime``. Note that on Unix
          this is the last metadata change, not the creation time.
        - modified: Last modification timestamp

    Examples
    --------
    >>> import pycancensus as pc
    >>> cache_list = pc.list_cache()
    >>> print(cache_list)
    """
    columns = ["cache_key", "file_path", "size_mb", "created", "modified"]

    cache_path = Path(get_cache_path())
    if not cache_path.exists():
        return pd.DataFrame(columns=columns)

    records = []
    for cache_file in cache_path.glob("*.pkl"):
        try:
            stat = cache_file.stat()
            records.append(
                {
                    "cache_key": cache_file.stem,
                    "file_path": str(cache_file),
                    "size_mb": round(stat.st_size / (1024 * 1024), 2),
                    "created": pd.Timestamp.fromtimestamp(stat.st_ctime),
                    "modified": pd.Timestamp.fromtimestamp(stat.st_mtime),
                }
            )
        except (OSError, ValueError, OverflowError):
            # File vanished / is unreadable, or carries a timestamp that
            # cannot be represented: skip it rather than fail the listing.
            continue

    # Passing the columns explicitly keeps the schema stable when no cache
    # files were found, matching the missing-directory case above.
    return pd.DataFrame(records, columns=columns)
[docs]
def remove_from_cache(
    cache_keys: Optional[List[str]] = None, all_cache: bool = False
) -> None:
    """
    Remove items from cache.

    Progress and problems are reported by printing messages; nothing is
    raised when an individual file cannot be removed.

    Parameters
    ----------
    cache_keys : list of str, optional
        Specific cache keys to remove. A single key passed as a plain
        string is treated as a one-element list. If None (or empty) and
        all_cache=False, does nothing.
    all_cache : bool, default False
        If True, removes all cached data and ignores ``cache_keys``.

    Examples
    --------
    >>> import pycancensus as pc
    >>> # Remove specific cache entries
    >>> pc.remove_from_cache(["regions_CA16", "vectors_CA16"])
    >>>
    >>> # Remove all cache (use with caution!)
    >>> pc.remove_from_cache(all_cache=True)
    """
    cache_path = Path(get_cache_path())
    if not cache_path.exists():
        print("No cache directory found.")
        return

    if all_cache:
        # Remove all .pkl files
        removed_count = 0
        for cache_file in cache_path.glob("*.pkl"):
            try:
                cache_file.unlink()
            except Exception as e:
                print(f"Warning: Failed to remove {cache_file}: {e}")
            else:
                removed_count += 1
        print(f"Removed {removed_count} cached files.")
        return

    # A bare string would otherwise be iterated character by character,
    # producing one bogus "not found" message per letter.
    if isinstance(cache_keys, str):
        cache_keys = [cache_keys] if cache_keys else []

    if not cache_keys:
        print("No cache keys specified and all_cache=False. Nothing to remove.")
        return

    # Remove specific cache keys
    removed_count = 0
    for cache_key in cache_keys:
        cache_file = cache_path / f"{cache_key}.pkl"
        try:
            # Attempt the delete directly instead of checking existence
            # first, so a file disappearing in between is still reported
            # as "not found" rather than as a failure.
            cache_file.unlink()
        except FileNotFoundError:
            print(f"Cache key not found: {cache_key}")
        except Exception as e:
            print(f"Warning: Failed to remove {cache_key}: {e}")
        else:
            removed_count += 1
            print(f"Removed cache: {cache_key}")

    if removed_count > 0:
        print(f"Removed {removed_count} cached files.")
[docs]
def clear_cache() -> None:
    """
    Delete every cached data file.

    Convenience wrapper equivalent to calling
    ``remove_from_cache(all_cache=True)``.

    Examples
    --------
    >>> import pycancensus as pc
    >>> pc.clear_cache()
    """
    # No individual keys are given; the all_cache flag selects everything.
    remove_from_cache(cache_keys=None, all_cache=True)