Aquí se encuentran todos los scripts usados en la Fase 2 para procesar los datos.
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text

# ------------------------------------------------------------
# POSTGRES DATABASE CONFIG
# ------------------------------------------------------------
# Connection parameters for the local Postgres instance.
DB_CONFIG = {
    "host": "localhost",
    "port": 5433,
    "user": "postgres",
    "password": "postgres",
    "database": "adicciones",
}

# SQLAlchemy connection URI assembled from the parameters above.
DB_URI = (
    "postgresql+psycopg2://"
    f"{DB_CONFIG['user']}:{DB_CONFIG['password']}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Shared engine reused by every query in this script.
engine = create_engine(DB_URI)

# ------------------------------------------------------------
# HELPER FUNCTIONS
# ------------------------------------------------------------
def read_table(table_name):
    """Read a full table from Postgres and return it as a cleaned DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table to read. It is interpolated into the SQL text
        (identifiers cannot be sent as bound parameters), so it must be a
        plain Python identifier.

    Returns
    -------
    pandas.DataFrame
        Table contents with column names lower-cased and stripped.

    Raises
    ------
    ValueError
        If ``table_name`` is not a valid identifier (guards the f-string
        interpolation against SQL injection).
    """
    # Identifiers cannot be parameterized, so reject anything that is not
    # a plain identifier before building the query string.
    if not table_name.isidentifier():
        raise ValueError(f"Invalid table name: {table_name!r}")

    with engine.begin() as conn:
        df = pd.read_sql_query(text(f"SELECT * FROM {table_name};"), conn)

    # Normalize column labels so downstream code can rely on lowercase keys.
    df.columns = df.columns.str.lower().str.strip()
    print(f"📥 Tabla '{table_name}' cargada: {df.shape[0]} filas, {df.shape[1]} columnas")
    return df
def normalize_year_columns(df):
    """Rename year-like columns to 'año' and coerce that column to numeric."""
    # Any column whose name contains a known year alias is mapped to 'año'.
    aliases = ("a_o", "ano", "anio")
    renames = {
        col: "año"
        for col in df.columns
        if any(alias in col for alias in aliases)
    }
    df.rename(columns=renames, inplace=True)

    # Renaming may have produced duplicate labels; keep the first occurrence.
    df = df.loc[:, ~df.columns.duplicated()]

    # Coerce the year column to numeric; unparseable values become NaN.
    if "año" in df.columns:
        df["año"] = pd.to_numeric(df["año"], errors="coerce")

    return df
def remove_outliers(df):
    """Drop rows that are outliers in any numeric column (1.5*IQR rule).

    NaN values are never treated as outliers. Columns are filtered
    sequentially, so each column's quantiles are computed on the frame
    already filtered by the previous columns.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns
    for column in numeric_cols:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        spread = 1.5 * (q3 - q1)
        lower, upper = q1 - spread, q3 + spread
        # Keep in-range values and missing values alike.
        in_range = df[column].between(lower, upper)
        df = df[in_range | df[column].isna()]
    return df
def numeric_summary(df, name):
    """Return describe() statistics for the numeric columns, tagged with *name*.

    Returns an empty DataFrame (after printing a warning) when *df* has no
    numeric columns at all.
    """
    numeric = df.select_dtypes(include=np.number)
    if numeric.empty:
        print(f"⚠️ '{name}' no tiene columnas numéricas.")
        return pd.DataFrame()
    summary = numeric.describe().T
    # Tag every row so stacked summaries stay attributable to their dataset.
    summary["dataset"] = name
    return summary
# ------------------------------------------------------------
# LOAD AND CLEAN
# ------------------------------------------------------------
# Logical dataset name -> Postgres table name.
tables = {
    "juego": "estadisticas_establecimientos_juego",
    "prohibidos": "registro_prohibidos_juego",
    "drogas": "consumo_drogas_alcohol_esp",
    "condenas": "condenas_sexo_localidad",
}

# Placeholder tokens that should be treated as missing data.
MISSING_TOKENS = ["", " ", "Desconocido", "nan", "NaN"]

frames = {}
for name, table in tables.items():
    cleaned = read_table(table)
    cleaned = normalize_year_columns(cleaned)
    cleaned.replace(MISSING_TOKENS, np.nan, inplace=True)
    frames[name] = remove_outliers(cleaned)
# ------------------------------------------------------------
# STATISTICAL ANALYSIS
# ------------------------------------------------------------
# One describe() table per non-empty dataset, stacked into a single CSV.
summaries = [
    numeric_summary(frame, name)
    for name, frame in frames.items()
    if not frame.empty
]

if not summaries:
    print("⚠️ No se encontraron columnas numéricas.")
else:
    pd.concat(summaries).to_csv("resumen_estadistico_por_dataset.csv")
    print("📊 'resumen_estadistico_por_dataset.csv' generado correctamente.")
|
# ------------------------------------------------------------
|
|
# COMBINACIÓN POR 'año'
|
|
# ------------------------------------------------------------
|
|
frames_con_año = {k: v for k, v in frames.items() if "año" in v.columns}
|
|
|
|
print("\n📅 Tablas con columna 'año':", list(frames_con_año.keys()))
|
|
|
|
if len(frames_con_año) > 1:
|
|
merged = None
|
|
for name, df in frames_con_año.items():
|
|
df = df.loc[:, ~df.columns.duplicated()] # eliminar duplicadas antes del merge
|
|
if merged is None:
|
|
merged = df
|
|
else:
|
|
merged = pd.merge(merged, df, on="año", how="outer", suffixes=("", f"_{name}"))
|
|
|
|
merged = remove_outliers(merged)
|
|
merged.to_csv("dataset_combinado_por_año.csv", index=False)
|
|
print(f"✅ 'dataset_combinado_por_año.csv' generado correctamente ({merged.shape[0]} filas, {merged.shape[1]} columnas).")
|
|
else:
|
|
print("⚠️ No hay suficientes datasets con columna 'año' para combinar.")
|