diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..0afd182a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,11 @@ +- bump: minor + changes: + added: + - Name-based seeding (seeded_rng) for order-independent reproducibility + - State-specific Medicaid takeup rates (53%-99% range, 51 jurisdictions) + - SSI resource test pass rate parameter (0.4) + - WIC takeup and nutritional risk draw variables (float) + - meets_ssi_resource_test boolean generation + changed: + - Replaced shared RNG (seed=100) with per-variable name-based seeding + - Medicaid takeup now uses state-specific rates instead of uniform 93% diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 84f01a8b..90dc3f03 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -14,6 +14,8 @@ ) from microimpute.models.qrf import QRF import logging +from policyengine_us_data.parameters import load_take_up_rate +from policyengine_us_data.utils.randomness import seeded_rng class CPS(Dataset): @@ -191,28 +193,101 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame): def add_takeup(self): data = self.load_dataset() - from policyengine_us import system, Microsimulation + from policyengine_us import Microsimulation baseline = Microsimulation(dataset=self) - parameters = baseline.tax_benefit_system.parameters(self.time_period) - generator = np.random.default_rng(seed=100) + n_persons = len(data["person_id"]) + n_tax_units = len(data["tax_unit_id"]) + n_spm_units = len(data["spm_unit_id"]) + + # Load take-up rates + eitc_rates_by_children = load_take_up_rate("eitc", self.time_period) + dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period) + snap_rate = load_take_up_rate("snap", self.time_period) + aca_rate = load_take_up_rate("aca", self.time_period) + medicaid_rates_by_state = load_take_up_rate("medicaid", self.time_period) + head_start_rate = load_take_up_rate("head_start", self.time_period) + early_head_start_rate = load_take_up_rate( + "early_head_start", self.time_period + ) + ssi_pass_rate = load_take_up_rate("ssi_pass_rate", self.time_period) - eitc_takeup_rates = parameters.gov.irs.credits.eitc.takeup + # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values - eitc_takeup_rate = eitc_takeup_rates.calc(eitc_child_count) - data["takes_up_eitc"] = ( - generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate + eitc_takeup_rate = np.array( + [ + eitc_rates_by_children.get(min(int(c), 3), 0.85) + for c in eitc_child_count + ] + ) + rng = seeded_rng("takes_up_eitc") + data["takes_up_eitc"] = rng.random(n_tax_units) < eitc_takeup_rate + + # DC Property Tax Credit + rng = seeded_rng("takes_up_dc_ptc") + data["takes_up_dc_ptc"] = rng.random(n_tax_units) < dc_ptc_rate + + # SNAP + rng = seeded_rng("takes_up_snap_if_eligible") + data["takes_up_snap_if_eligible"] = rng.random(n_spm_units) < snap_rate + + # ACA + rng = seeded_rng("takes_up_aca_if_eligible") + data["takes_up_aca_if_eligible"] = rng.random(n_tax_units) < aca_rate + + # Medicaid: state-specific rates + state_codes = baseline.calculate("state_code_str").values + hh_ids = data["household_id"] + person_hh_ids = data["person_household_id"] + hh_to_state = dict(zip(hh_ids, state_codes)) + person_states = np.array( + [hh_to_state.get(hh_id, "CA") for hh_id in person_hh_ids] ) - dc_ptc_takeup_rate = parameters.gov.states.dc.tax.income.credits.ptc.takeup - data["takes_up_dc_ptc"] = ( - generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate + medicaid_rate_by_person = np.array( + [medicaid_rates_by_state.get(s, 0.93) for s in person_states] + ) + rng = seeded_rng("takes_up_medicaid_if_eligible") + data["takes_up_medicaid_if_eligible"] = ( + rng.random(n_persons) < medicaid_rate_by_person + ) + + # Head Start + rng = seeded_rng("takes_up_head_start_if_eligible") + data["takes_up_head_start_if_eligible"] = ( + rng.random(n_persons) < head_start_rate + ) + + # Early Head Start + rng = seeded_rng("takes_up_early_head_start_if_eligible") + data["takes_up_early_head_start_if_eligible"] = ( + rng.random(n_persons) < early_head_start_rate ) - generator = np.random.default_rng(seed=100) - data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"])) - data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"])) - data["medicaid_take_up_seed"] = generator.random(len(data["person_id"])) + # SSI resource test + rng = seeded_rng("meets_ssi_resource_test") + data["meets_ssi_resource_test"] = rng.random(n_persons) < ssi_pass_rate + + # WIC: resolve draws to bools using category-specific rates + wic_categories = baseline.calculate("wic_category_str").values + wic_takeup_rates = load_take_up_rate("wic_takeup", self.time_period) + wic_takeup_rate_by_person = np.array( + [wic_takeup_rates.get(c, 0) for c in wic_categories] + ) + rng = seeded_rng("would_claim_wic") + data["would_claim_wic"] = rng.random(n_persons) < wic_takeup_rate_by_person + + # WIC nutritional risk — fully resolved + wic_risk_rates = load_take_up_rate( + "wic_nutritional_risk", self.time_period + ) + wic_risk_rate_by_person = np.array( + [wic_risk_rates.get(c, 0) for c in wic_categories] + ) + receives_wic = baseline.calculate("receives_wic").values + rng = seeded_rng("is_wic_at_nutritional_risk") + imputed_risk = rng.random(n_persons) < wic_risk_rate_by_person + data["is_wic_at_nutritional_risk"] = receives_wic | imputed_risk self.save_dataset(data) diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py new file mode 100644 index 00000000..2fcddb5a --- /dev/null +++ b/policyengine_us_data/parameters/__init__.py @@ -0,0 +1,72 @@ +""" +Take-up rate parameters for stochastic simulation. + +These parameters are stored in the data package to keep the country package +as a purely deterministic rules engine. +""" + +import yaml +from pathlib import Path + +PARAMETERS_DIR = Path(__file__).parent + + +def load_take_up_rate(variable_name: str, year: int = 2018): + """Load take-up rate from YAML parameter files. + + Args: + variable_name: Name of the take-up parameter file (without .yaml) + year: Year for which to get the rate + + Returns: + float, dict (EITC rates_by_children), or dict (Medicaid + rates_by_state) + """ + yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml" + + with open(yaml_path) as f: + data = yaml.safe_load(f) + + # EITC: rates by number of children + if "rates_by_children" in data: + return data["rates_by_children"] + + # Medicaid: state-specific rates + if "rates_by_state" in data: + return data["rates_by_state"] + + # WIC-style: rates by category (each category has a time series) + if "rates_by_category" in data: + result = {} + for category, time_series in data["rates_by_category"].items(): + applicable_value = None + for y, value in sorted(time_series.items()): + if int(y) <= year: + applicable_value = value + else: + break + if applicable_value is not None: + result[category] = applicable_value + return result + + # Standard time-series values + values = data["values"] + applicable_value = None + + for date_key, value in sorted(values.items()): + if hasattr(date_key, "year"): + date_year = date_key.year + else: + date_year = int(date_key.split("-")[0]) + + if date_year <= year: + applicable_value = value + else: + break + + if applicable_value is None: + raise ValueError( + f"No take-up rate found for {variable_name} in {year}" + ) + + return applicable_value diff --git a/policyengine_us_data/parameters/take_up/aca.yaml b/policyengine_us_data/parameters/take_up/aca.yaml new file mode 100644 index 00000000..98f92014 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/aca.yaml @@ -0,0 +1,10 @@ +description: Percentage of eligible people who do enroll in Affordable Care Act coverage, if eligible. +metadata: + label: ACA takeup rate + unit: /1 + period: year + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the +values: + 2018-01-01: 0.672 diff --git a/policyengine_us_data/parameters/take_up/dc_ptc.yaml b/policyengine_us_data/parameters/take_up/dc_ptc.yaml new file mode 100644 index 00000000..6195ecf3 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/dc_ptc.yaml @@ -0,0 +1,11 @@ +description: The share of eligible individuals who claim the DC property tax credit. +metadata: + unit: /1 + label: DC property tax credit takeup rate + period: year + reference: + - title: District of Columbia Tax Expenditure Report, 2024 + href: https://ora-cfo.dc.gov/sites/default/files/dc/sites/ora-cfo/publication/attachments/2024%20Tax%20Expenditure%20Report.pdf#page=234 +values: + # 37,133 (from 2024 Tax Expenditure Report) / 131,791,388 (PolicyEngine DC PTC value estimate) + 2021-01-01: 0.32 diff --git a/policyengine_us_data/parameters/take_up/early_head_start.yaml b/policyengine_us_data/parameters/take_up/early_head_start.yaml new file mode 100644 index 00000000..3802d988 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/early_head_start.yaml @@ -0,0 +1,9 @@ +description: Percentage of eligible infants and toddlers who enroll in Early Head Start. +metadata: + label: Early Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.09 diff --git a/policyengine_us_data/parameters/take_up/eitc.yaml b/policyengine_us_data/parameters/take_up/eitc.yaml new file mode 100644 index 00000000..17aa9daa --- /dev/null +++ b/policyengine_us_data/parameters/take_up/eitc.yaml @@ -0,0 +1,12 @@ +description: The share of eligible individuals who claim the EITC (by number of children). +metadata: + label: EITC take-up rate by number of children + reference: + - title: National Taxpayer Advocate Special Report to Congress 2020 | IRS + href: https://www.taxpayeradvocate.irs.gov/wp-content/uploads/2020/08/JRC20_Volume3.pdf#page=62 +# Maps number of children to take-up rate +rates_by_children: + 0: 0.65 + 1: 0.86 + 2: 0.85 + 3: 0.85 # Assume same as 2 diff --git a/policyengine_us_data/parameters/take_up/head_start.yaml b/policyengine_us_data/parameters/take_up/head_start.yaml new file mode 100644 index 00000000..9495f44b --- /dev/null +++ b/policyengine_us_data/parameters/take_up/head_start.yaml @@ -0,0 +1,10 @@ +description: Percentage of eligible children who enroll in Head Start. +metadata: + label: Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.40 + 2021-09-01: 0.30 diff --git a/policyengine_us_data/parameters/take_up/medicaid.yaml b/policyengine_us_data/parameters/take_up/medicaid.yaml new file mode 100644 index 00000000..3beb851b --- /dev/null +++ b/policyengine_us_data/parameters/take_up/medicaid.yaml @@ -0,0 +1,64 @@ +description: Percentage of people who do enroll in Medicaid, if eligible. +metadata: + label: Medicaid takeup rate + unit: /1 + period: year + breakdown: + - state_code + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/ + - title: State-specific rates derived from MACPAC enrollment targets vs modeled eligibility + href: https://www.medicaid.gov/medicaid/program-information/medicaid-and-chip-enrollment-data/report-highlights/index.html +rates_by_state: + AK: 0.88 + AL: 0.92 + AR: 0.79 + AZ: 0.95 + CA: 0.78 + CO: 0.99 + CT: 0.89 + DC: 0.99 + DE: 0.86 + FL: 0.98 + GA: 0.73 + HI: 0.88 + IA: 0.84 + ID: 0.78 + IL: 0.85 + IN: 0.99 + KS: 0.92 + KY: 0.87 + LA: 0.79 + MA: 0.94 + MD: 0.95 + ME: 0.92 + MI: 0.91 + MN: 0.89 + MO: 0.89 + MS: 0.75 + MT: 0.83 + NC: 0.94 + ND: 0.91 + NE: 0.79 + NH: 0.84 + NJ: 0.74 + NM: 0.84 + NV: 0.93 + NY: 0.86 + OH: 0.82 + OK: 0.77 + OR: 0.92 + PA: 0.64 + RI: 0.94 + SC: 0.93 + SD: 0.88 + TN: 0.92 + TX: 0.76 + UT: 0.53 + VA: 0.82 + VT: 0.93 + WA: 0.98 + WI: 0.91 + WV: 0.83 + WY: 0.70 diff --git a/policyengine_us_data/parameters/take_up/snap.yaml b/policyengine_us_data/parameters/take_up/snap.yaml new file mode 100644 index 00000000..12b6012e --- /dev/null +++ b/policyengine_us_data/parameters/take_up/snap.yaml @@ -0,0 +1,9 @@ +description: Percentage of eligible SNAP recipients who claim SNAP. +metadata: + label: SNAP takeup rate + unit: /1 + reference: + - title: USDA + href: https://www.fns.usda.gov/usamap +values: + 2018-01-01: 0.82 diff --git a/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml b/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml new file mode 100644 index 00000000..415b3c6c --- /dev/null +++ b/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml @@ -0,0 +1,10 @@ +description: Proportion of SSI-aged-blind-disabled recipients who meet the asset test. +metadata: + label: SSI resource test pass rate + unit: /1 + period: year + reference: + - title: SSI resource test pass rate from policyengine-us + href: https://github.com/PolicyEngine/policyengine-us +values: + 2018-01-01: 0.4 diff --git a/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml b/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml new file mode 100644 index 00000000..c327b428 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml @@ -0,0 +1,13 @@ +rates_by_category: + PREGNANT: + 1980: 0.913 + POSTPARTUM: + 1980: 0.933 + BREASTFEEDING: + 1980: 0.889 + INFANT: + 1980: 0.95 + CHILD: + 1980: 0.752 + NONE: + 1980: 0 diff --git a/policyengine_us_data/parameters/take_up/wic_takeup.yaml b/policyengine_us_data/parameters/take_up/wic_takeup.yaml new file mode 100644 index 00000000..ec0f6355 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/wic_takeup.yaml @@ -0,0 +1,33 @@ +rates_by_category: + PREGNANT: + 2018: 0.533 + 2019: 0.523 + 2020: 0.456 + 2021: 0.437 + 2022: 0.456 + POSTPARTUM: + 2018: 0.844 + 2019: 0.847 + 2020: 0.685 + 2021: 0.672 + 2022: 0.689 + BREASTFEEDING: + 2018: 0.687 + 2019: 0.684 + 2020: 0.604 + 2021: 0.608 + 2022: 0.663 + INFANT: + 2018: 0.978 + 2019: 0.984 + 2020: 0.817 + 2021: 0.78 + 2022: 0.784 + CHILD: + 2018: 0.442 + 2019: 0.448 + 2020: 0.406 + 2021: 0.432 + 2022: 0.46 + NONE: + 2018: 0 diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py new file mode 100644 index 00000000..1819241a --- /dev/null +++ b/policyengine_us_data/tests/test_stochastic_variables.py @@ -0,0 +1,154 @@ +"""Tests for stochastic variable generation in the data package.""" + +import pytest +import numpy as np +from policyengine_us_data.parameters import load_take_up_rate +from policyengine_us_data.utils.randomness import ( + _stable_string_hash, + seeded_rng, +) + + +class TestTakeUpRateParameters: + + def test_eitc_rate_loads(self): + rates = load_take_up_rate("eitc", 2022) + assert isinstance(rates, dict) + for key, rate in rates.items(): + assert 0 < rate <= 1 + + def test_snap_rate_loads(self): + rate = load_take_up_rate("snap", 2022) + assert 0 < rate <= 1 + + def test_medicaid_rate_loads_state_specific(self): + rates = load_take_up_rate("medicaid", 2022) + assert isinstance(rates, dict) + assert len(rates) == 51 # 50 states + DC + for state, rate in rates.items(): + assert 0 < rate <= 1, f"{state}: {rate}" + assert rates["UT"] == 0.53 + assert rates["CO"] == 0.99 + + def test_aca_rate_loads(self): + rate = load_take_up_rate("aca", 2022) + assert 0 < rate <= 1 + + def test_head_start_rate_loads(self): + rate = load_take_up_rate("head_start", 2022) + assert 0 < rate <= 1 + + def test_early_head_start_rate_loads(self): + rate = load_take_up_rate("early_head_start", 2022) + assert 0 < rate <= 1 + + def test_dc_ptc_rate_loads(self): + rate = load_take_up_rate("dc_ptc", 2022) + assert 0 < rate <= 1 + + def test_ssi_pass_rate_loads(self): + rate = load_take_up_rate("ssi_pass_rate", 2022) + assert rate == 0.4 + + +class TestStableStringHash: + + def test_deterministic(self): + h1 = _stable_string_hash("takes_up_snap_if_eligible") + h2 = _stable_string_hash("takes_up_snap_if_eligible") + assert h1 == h2 + + def test_different_strings_differ(self): + h1 = _stable_string_hash("takes_up_snap_if_eligible") + h2 = _stable_string_hash("takes_up_aca_if_eligible") + assert h1 != h2 + + def test_returns_uint64(self): + h = _stable_string_hash("test") + assert h.dtype == np.uint64 + + +class TestSeededRng: + + def test_same_name_same_results(self): + rng1 = seeded_rng("takes_up_snap_if_eligible") + result1 = rng1.random(1000) + rng2 = seeded_rng("takes_up_snap_if_eligible") + result2 = rng2.random(1000) + np.testing.assert_array_equal(result1, result2) + + def test_different_names_different_results(self): + rng1 = seeded_rng("takes_up_snap_if_eligible") + result1 = rng1.random(1000) + rng2 = seeded_rng("takes_up_aca_if_eligible") + result2 = rng2.random(1000) + assert not np.array_equal(result1, result2) + + def test_order_independence(self): + """Generating variables in different order produces same values.""" + # Order A: SNAP then ACA + rng_snap_a = seeded_rng("takes_up_snap_if_eligible") + snap_a = rng_snap_a.random(1000) + rng_aca_a = seeded_rng("takes_up_aca_if_eligible") + aca_a = rng_aca_a.random(1000) + + # Order B: ACA then SNAP + rng_aca_b = seeded_rng("takes_up_aca_if_eligible") + aca_b = rng_aca_b.random(1000) + rng_snap_b = seeded_rng("takes_up_snap_if_eligible") + snap_b = rng_snap_b.random(1000) + + np.testing.assert_array_equal(snap_a, snap_b) + np.testing.assert_array_equal(aca_a, aca_b) + + +class TestTakeUpProportions: + + def test_take_up_produces_expected_proportion(self): + rate = 0.7 + n = 10_000 + rng = seeded_rng("test_variable") + take_up = rng.random(n) < rate + assert abs(take_up.mean() - rate) < 0.05 + + def test_boolean_generation(self): + rng = seeded_rng("test_bool") + take_up = rng.random(100) < 0.5 + assert take_up.dtype == bool + assert set(take_up).issubset({True, False}) + + def test_wic_takeup_rates_load(self): + rates = load_take_up_rate("wic_takeup", 2022) + assert isinstance(rates, dict) + assert rates["PREGNANT"] == 0.456 + assert rates["INFANT"] == 0.784 + assert rates["NONE"] == 0 + + def test_wic_nutritional_risk_rates_load(self): + rates = load_take_up_rate("wic_nutritional_risk", 2022) + assert isinstance(rates, dict) + assert rates["INFANT"] == 0.95 + assert rates["CHILD"] == 0.752 + assert rates["NONE"] == 0 + + def test_wic_category_specific_proportions(self): + rates = load_take_up_rate("wic_takeup", 2022) + n = 10_000 + rng = seeded_rng("would_claim_wic") + draws = rng.random(n) + for category, expected_rate in [ + ("INFANT", 0.784), + ("CHILD", 0.46), + ]: + take_up = draws[:n] < expected_rate + assert abs(take_up.mean() - expected_rate) < 0.05 + + def test_state_specific_medicaid_proportions(self): + rates = load_take_up_rate("medicaid", 2022) + rng = seeded_rng("takes_up_medicaid_if_eligible") + n = 50_000 + draws = rng.random(n) + # Test a few states + for state, expected_rate in [("UT", 0.53), ("CO", 0.99)]: + take_up = draws[:10_000] < expected_rate + assert abs(take_up.mean() - expected_rate) < 0.05 diff --git a/policyengine_us_data/utils/randomness.py b/policyengine_us_data/utils/randomness.py new file mode 100644 index 00000000..eac01522 --- /dev/null +++ b/policyengine_us_data/utils/randomness.py @@ -0,0 +1,30 @@ +import warnings +import numpy as np + + +def _stable_string_hash(s: str) -> np.uint64: + """Deterministic hash consistent across Python processes. + + Python's built-in hash() is not deterministic across processes + (since 3.3), so we use a polynomial rolling hash with mixing. + + Ported from policyengine_core.commons.formulas._stable_string_hash. + """ + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "overflow encountered", RuntimeWarning + ) + h = np.uint64(0) + for byte in s.encode("utf-8"): + h = h * np.uint64(31) + np.uint64(byte) + h = h ^ (h >> np.uint64(33)) + h = h * np.uint64(0xFF51AFD7ED558CCD) + h = h ^ (h >> np.uint64(33)) + return h + + +def seeded_rng(variable_name: str, salt: str = None) -> np.random.Generator: + """Create a per-variable RNG seeded by variable name hash.""" + key = variable_name if salt is None else f"{variable_name}:{salt}" + seed = int(_stable_string_hash(key)) % (2**63) + return np.random.default_rng(seed=seed)