"""Load addiction-related tables from Postgres, clean them and export summaries.

The script reads each configured table, normalises its year column, removes
IQR outliers, writes per-dataset descriptive statistics to a CSV file and,
when at least two tables expose a year column, writes an outer join of those
tables on the year to a second CSV file.
"""

from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

# ------------------------------------------------------------
# POSTGRES DATABASE CONFIG
# ------------------------------------------------------------
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment before sharing or deploying this script.
DB_CONFIG = {
    "host": "localhost",
    "port": 5433,
    "user": "postgres",
    "password": "postgres",
    "database": "adicciones",
}

# User and password are URL-escaped so that characters such as "@", ":" or "/"
# inside a credential cannot corrupt the connection URI.
DB_URI = (
    f"postgresql+psycopg2://{quote_plus(DB_CONFIG['user'])}:"
    f"{quote_plus(DB_CONFIG['password'])}@"
    f"{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

engine = create_engine(DB_URI)

# Canonical name given to the year column, and the substrings that mark a
# source column as holding the year.
YEAR_COL = "año"
YEAR_ALIASES = ("a_o", "ano", "anio")

# Values treated as missing data once a table has been loaded.
MISSING_MARKERS = ["", " ", "Desconocido", "nan", "NaN"]

SUMMARY_CSV = "resumen_estadistico_por_dataset.csv"
MERGED_CSV = "dataset_combinado_por_año.csv"


# ------------------------------------------------------------
# HELPER FUNCTIONS
# ------------------------------------------------------------
def read_table(table_name: str) -> pd.DataFrame:
    """Read a whole table from Postgres into a DataFrame.

    Column names are lower-cased and stripped of surrounding whitespace, and
    a one-line load report is printed.

    Args:
        table_name: Name of the table to read. It is interpolated into the
            SQL text as-is, so only trusted identifiers must be passed.

    Returns:
        The table contents with normalised column names.
    """
    with engine.begin() as conn:
        df = pd.read_sql_query(text(f"SELECT * FROM {table_name};"), conn)
    df.columns = df.columns.str.lower().str.strip()
    print(
        f"📄 Tabla '{table_name}' cargada: "
        f"{df.shape[0]} filas, {df.shape[1]} columnas"
    )
    return df


def normalize_year_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose year column is named ``YEAR_COL``.

    Every column whose name contains one of ``YEAR_ALIASES`` is renamed to
    ``YEAR_COL``; when that yields several columns with the same name only
    the first one is kept. The year column, if present, is converted to a
    numeric dtype with unparseable values turned into NaN.

    The input frame is left untouched.
    """
    # NOTE(review): matching is by substring, so an unrelated name that
    # merely contains an alias (e.g. "humano") is renamed too; confirm
    # against the real table schemas.
    rename_map = {
        col: YEAR_COL
        for col in df.columns
        # Non-string labels cannot contain an alias and would make ``in`` fail.
        if isinstance(col, str) and any(alias in col for alias in YEAR_ALIASES)
    }
    renamed = df.rename(columns=rename_map)

    # Drop repeated column names (keeps the first occurrence). The explicit
    # copy makes the assignment below operate on an independent frame
    # instead of on a slice of the caller's data.
    result = renamed.loc[:, ~renamed.columns.duplicated()].copy()

    if YEAR_COL in result.columns:
        result[YEAR_COL] = pd.to_numeric(result[YEAR_COL], errors="coerce")
    return result


def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows that hold an IQR outlier in any numeric column.

    Numeric columns are processed one after another. For each of them, rows
    whose value lies outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are dropped,
    while rows with a missing value in that column are kept. Quartiles of
    later columns are computed on the rows that survived the earlier ones.
    """
    # NOTE(review): the year column is numeric as well, so it is filtered
    # like any other measurement; confirm that this is intended.
    for col in df.select_dtypes(include=np.number).columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        keep = df[col].between(lower, upper) | df[col].isna()
        df = df[keep]
    return df


def numeric_summary(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Return descriptive statistics for the numeric columns of *df*.

    The result has one row per numeric column (the transposed output of
    ``DataFrame.describe``) plus a ``dataset`` column holding *name*. When
    *df* has no numeric columns a warning is printed and an empty DataFrame
    is returned.
    """
    numeric = df.select_dtypes(include=np.number)
    if numeric.empty:
        print(f"⚠️ '{name}' no tiene columnas numéricas.")
        return pd.DataFrame()
    summary = numeric.describe().T
    summary["dataset"] = name
    return summary


# ------------------------------------------------------------
# LOADING AND CLEANING
# ------------------------------------------------------------
# Short dataset name -> Postgres table name.
tables = {
    "juego": "estadisticas_establecimientos_juego",
    "prohibidos": "registro_prohibidos_juego",
    "drogas": "consumo_drogas_alcohol_esp",
    "condenas": "condenas_sexo_localidad",
}

frames = {}
for name, table in tables.items():
    df = read_table(table)
    df = normalize_year_columns(df)
    df = df.replace(MISSING_MARKERS, np.nan)
    df = remove_outliers(df)
    frames[name] = df

# ------------------------------------------------------------
# STATISTICAL ANALYSIS
# ------------------------------------------------------------
# Only non-empty summaries are collected; otherwise a run without any
# numeric column would still write an empty CSV and report success.
summaries = []
for name, df in frames.items():
    if df.empty:
        continue
    summary = numeric_summary(df, name)
    if not summary.empty:
        summaries.append(summary)

if summaries:
    pd.concat(summaries).to_csv(SUMMARY_CSV)
    print(f"📊 '{SUMMARY_CSV}' generado correctamente.")
else:
    print("⚠️ No se encontraron columnas numéricas.")

# ------------------------------------------------------------
# COMBINATION BY YEAR
# ------------------------------------------------------------
frames_with_year = {k: v for k, v in frames.items() if YEAR_COL in v.columns}
print("\n📅 Tablas con columna 'año':", list(frames_with_year))

if len(frames_with_year) > 1:
    merged = None
    for name, df in frames_with_year.items():
        # Guard against repeated column names before merging.
        df = df.loc[:, ~df.columns.duplicated()]
        if merged is None:
            merged = df
        else:
            # Columns that clash with the accumulated frame get the dataset
            # name as suffix; the accumulated side keeps its names.
            # NOTE(review): this is a many-to-many join on the year, so
            # tables with several rows per year multiply each other's rows;
            # confirm that this is the intended shape.
            merged = pd.merge(
                merged,
                df,
                on=YEAR_COL,
                how="outer",
                suffixes=("", f"_{name}"),
            )

    merged = remove_outliers(merged)
    merged.to_csv(MERGED_CSV, index=False)
    print(
        f"✅ '{MERGED_CSV}' generado correctamente "
        f"({merged.shape[0]} filas, {merged.shape[1]} columnas)."
    )
else:
    print("⚠️ No hay suficientes datasets con columna 'año' para combinar.")