#!/usr/bin/env python3
"""
02_census_explore.py — Load and explore Census 2024 persona data.

IELE756 · Week 0 · March 6, 2026

NOTE: The Census parquet file is NOT included in this repo.
Download it from INE and place it at the path below, or adjust
the path to match your Google Drive / Colab setup.
"""

import pandas as pd

# ── Location of the Census parquet file — edit to match your setup ──
PARQUET_PATH = "personas_censo2024.parquet"

# ── 1. Load ──
# Only the columns used in this exploration are requested; passing
# `columns=` keeps pandas from reading the rest of the file.
persona = pd.read_parquet(
    PARQUET_PATH,
    columns=[
        "region",
        "comuna",
        "sexo",
        "edad",
        "p27_nacionalidad",
        "p27_nacionalidad_rec",
        "escolaridad",
        "sit_fuerza_trabajo",
    ],
)

# Row count (with thousands separators) followed by each column's dtype.
print(f"Total personas: {len(persona):,}", persona.dtypes, sep="\n")

# ── 2. Basic inspection ──
# Each heading and its value go out through one print call, separated by
# a newline, so every section appears as "heading, then content".

# (n_rows, n_columns) of the loaded frame.
print("\n── Shape ──", persona.shape, sep="\n")

# Preview of the first ten records.
print("\n── First 10 rows ──", persona.head(10), sep="\n")

# info() writes its summary itself, so it is called bare rather than
# wrapped in print().
print("\n── Info ──")
persona.info()

# ── 3. Filter to Tarapacá (region == 1) ──
# Keep only the rows whose `region` value equals 1, selected with a
# boolean mask through .loc.
tarapaca = persona.loc[persona["region"] == 1]

# Number of persons left after the regional filter.
print(f"\nTarapacá: {len(tarapaca):,} personas")

# ── 4. Nationality breakdown ──
# Raw codes: 1 = Chilean only, 2 = Chilean + other, 3 = Foreign,
# -99 = No response.
print(
    "\n── Raw nationality codes ──",
    tarapaca["p27_nacionalidad"].value_counts(),
    sep="\n",
)

# ── 5. % foreign-born ──
# Absolute counts per category of the recoded nationality column.
print(
    "\n── Recoded nationality ──",
    tarapaca["p27_nacionalidad_rec"].value_counts(),
    sep="\n",
)

# normalize=True turns the counts into shares of the counted rows.
foreign = tarapaca["p27_nacionalidad_rec"].value_counts(normalize=True)

# .get(..., 0) falls back to 0 when no category is labelled 'Extranjero',
# so a 0.0% result may mean "label not present" rather than "no foreigners".
# NOTE(review): assumes the recoded column uses the label 'Extranjero' —
# confirm against the data dictionary.
# NOTE(review): the share is computed from a nationality column, while the
# message says "foreign-born"; confirm which concept is intended.
print(f"\n% foreign-born in Tarapacá: {foreign.get('Extranjero', 0):.1%}")
